From 96284ab295d26e00ea36e21ed31e26b39b9601d0 Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Mon, 14 Mar 2016 13:52:44 +0100 Subject: [PATCH 01/48] added sgemm- and strmm-kernel for POWER8 --- kernel/power/KERNEL.POWER8 | 14 +- kernel/power/sgemm_kernel_16x8_power8.S | 354 ++ kernel/power/sgemm_logic_16x8_power8.S | 2172 ++++++++ kernel/power/sgemm_macros_16x8_power8.S | 6145 +++++++++++++++++++++++ kernel/power/strmm_kernel_16x8_power8.S | 364 ++ kernel/power/strmm_logic_16x8_power8.S | 2969 +++++++++++ param.h | 10 +- 7 files changed, 12018 insertions(+), 10 deletions(-) create mode 100644 kernel/power/sgemm_kernel_16x8_power8.S create mode 100644 kernel/power/sgemm_logic_16x8_power8.S create mode 100644 kernel/power/sgemm_macros_16x8_power8.S create mode 100644 kernel/power/strmm_kernel_16x8_power8.S create mode 100644 kernel/power/strmm_logic_16x8_power8.S diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index 760d568cd..d40b20dd8 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -3,14 +3,18 @@ #CGEMM_BETA = ../generic/zgemm_beta.c #ZGEMM_BETA = ../generic/zgemm_beta.c -STRMMKERNEL = gemm_kernel_power6.S +STRMMKERNEL = strmm_kernel_16x8_power8.S DTRMMKERNEL = dtrmm_kernel_16x4_power8.S CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c ZTRMMKERNEL = ztrmm_kernel_8x2_power8.S -SGEMMKERNEL = gemm_kernel_power6.S -SGEMMONCOPY = ../generic/gemm_ncopy_4.c -SGEMMOTCOPY = ../generic/gemm_tcopy_4.c +SGEMMKERNEL = sgemm_kernel_16x8_power8.S +SGEMMINCOPY = ../generic/gemm_ncopy_16.c +SGEMMITCOPY = ../generic/gemm_tcopy_16.c +SGEMMONCOPY = ../generic/gemm_ncopy_8.c +SGEMMOTCOPY = ../generic/gemm_tcopy_8.c +SGEMMINCOPYOBJ = sgemm_incopy.o +SGEMMITCOPYOBJ = sgemm_itcopy.o SGEMMONCOPYOBJ = sgemm_oncopy.o SGEMMOTCOPYOBJ = sgemm_otcopy.o @@ -146,7 +150,7 @@ ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c #SGEMVTKERNEL = ../arm/gemv_t.c #DGEMVTKERNEL = ../arm/gemv_t.c #CGEMVTKERNEL = ../arm/zgemv_t.c -#ZGEMVTKERNEL = ../arm/zgemv_t.c 
+ZGEMVTKERNEL = zgemv_t_4.c #SSYMV_U_KERNEL = ../generic/symv_k.c diff --git a/kernel/power/sgemm_kernel_16x8_power8.S b/kernel/power/sgemm_kernel_16x8_power8.S new file mode 100644 index 000000000..9f221301a --- /dev/null +++ b/kernel/power/sgemm_kernel_16x8_power8.S @@ -0,0 +1,354 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2016/03/14 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +**************************************************************************************/ + +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "def_vsx.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 320 +#define ALPHA_SP 296(SP) +#define FZERO 304(SP) +#else +#define STACKSIZE 240 +#define ALPHA_SP 224(SP) +#define FZERO 232(SP) +#endif + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r8 +#define B r9 +#define C r10 +#define LDC r7 +#define OFFSET r6 +#else +#define A r7 +#define B r8 +#define C r9 +#define LDC r10 +#define OFFSET r6 +#endif +#endif + +#define alpha_r vs30 +#define alpha_vr vs31 + +#define o0 0 + +#define o4 r15 +#define o12 r16 +#define o8 r17 +#define L r18 +#define T1 r19 +#define KK r20 +#define BB r21 +#define I r22 +#define J r23 +#define AO r24 +#define BO r25 +#define CO r26 +#define o16 r27 +#define o32 r28 +#define o48 r29 + +#define PRE r30 +#define T2 r31 + +#include "sgemm_macros_16x8_power8.S" + + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) 
+ +#ifdef __64BIT__ + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) + std r17, 256(SP) + std r16, 264(SP) + std r15, 272(SP) +#else + stw r31, 144(SP) + stw r30, 148(SP) + stw r29, 152(SP) + stw r28, 156(SP) + stw r27, 160(SP) + stw r26, 164(SP) + stw r25, 168(SP) + stw r24, 172(SP) + stw r23, 176(SP) + stw r22, 180(SP) + stw r21, 184(SP) + stw r20, 188(SP) + stw r19, 192(SP) + stw r18, 196(SP) + stw r17, 200(SP) + stw r16, 204(SP) + stw r15, 208(SP) +#endif + + // stfd f1, ALPHA_SP + // stw r0, FZERO + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) + lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) +#endif +#endif + + slwi LDC, LDC, 2 + +#if defined(TRMMKERNEL) +#if defined(linux) && defined(__64BIT__) + ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) +#else + lwz OFFSET, FRAMESLOT(0) + STACKSIZE(SP) +#endif +#endif +#endif +#endif + + + cmpwi cr0, M, 0 + ble .L999_H1 + cmpwi cr0, N, 0 + ble .L999_H1 + cmpwi cr0, K, 0 + ble .L999_H1 + + li PRE, 384 + li o4 , 4 + li o8 , 8 + li o12, 12 + li o16, 16 + li o32, 32 + li o48, 48 + + addi T1, SP, 300 + stfs f1, 0(T1) + stfs f1, 4(T1) + stfs f1, 8(T1) + stfs f1,12(T1) + + lxsspx vs28, 0, T1 + + xxspltw alpha_r, vs28 , 0 + lxvw4x alpha_vr, 0, T1 + + + +#include "sgemm_logic_16x8_power8.S" + +.L999: + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 
128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) + ld r17, 256(SP) + ld r16, 264(SP) + ld r15, 272(SP) +#else + lwz r31, 144(SP) + lwz r30, 148(SP) + lwz r29, 152(SP) + lwz r28, 156(SP) + lwz r27, 160(SP) + lwz r26, 164(SP) + lwz r25, 168(SP) + lwz r24, 172(SP) + lwz r23, 176(SP) + lwz r22, 180(SP) + lwz r21, 184(SP) + lwz r20, 188(SP) + lwz r19, 192(SP) + lwz r18, 196(SP) + lwz r17, 200(SP) + lwz r16, 204(SP) + lwz r15, 208(SP) +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE +#endif diff --git a/kernel/power/sgemm_logic_16x8_power8.S b/kernel/power/sgemm_logic_16x8_power8.S new file mode 100644 index 000000000..6c5a1c7ef --- /dev/null +++ b/kernel/power/sgemm_logic_16x8_power8.S @@ -0,0 +1,2172 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/03/14 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +**************************************************************************************/ + + + srawi. J, N, 3 + ble .LSGEMM_L8_END + +.LSGEMM_L8_BEGIN: + + mr CO, C + mr AO, A + slwi T1, LDC , 3 + add C, C, T1 + srawi. I, M, 4 + ble .LSGEMM_L8x16_END + +.LSGEMM_L8x16_BEGIN: + + + mr BO, B + srawi. L, K, 3 + ble .LSGEMM_L8x16_SUB0 + cmpwi cr0, L, 1 + ble .LSGEMM_L8x16_SUB4 + +.LSGEMM_L8x16_LOOP_START: + + dcbt AO, PRE + LOAD8x16_1 + KERNEL8x16_I1 + dcbt AO, PRE + KERNEL8x16_2 + KERNEL8x16_1 + dcbt AO, PRE + KERNEL8x16_2 + + KERNEL8x16_1 + dcbt AO, PRE + KERNEL8x16_2 + KERNEL8x16_1 + dcbt AO, PRE + KERNEL8x16_2 + + addic. L, L, -2 + ble .LSGEMM_L8x16_LOOP_END + + .align 5 + +.LSGEMM_L8x16_LOOP: + + KERNEL8x16_1 + dcbt AO, PRE + KERNEL8x16_2 + KERNEL8x16_1 + dcbt AO, PRE + KERNEL8x16_2 + + KERNEL8x16_1 + dcbt AO, PRE + KERNEL8x16_2 + KERNEL8x16_1 + dcbt AO, PRE + KERNEL8x16_2 + + addic. 
L, L, -1 + bgt .LSGEMM_L8x16_LOOP + +.LSGEMM_L8x16_LOOP_END: + + KERNEL8x16_1 + dcbt AO, PRE + KERNEL8x16_2 + KERNEL8x16_1 + dcbt AO, PRE + KERNEL8x16_2 + + KERNEL8x16_1 + dcbt AO, PRE + KERNEL8x16_2 + KERNEL8x16_1 + KERNEL8x16_E2 + + b .LSGEMM_L8x16_SUB1 + +.LSGEMM_L8x16_SUB4: + + dcbt AO, PRE + KERNEL8x16_SUBI1 + KERNEL8x16_SUB1 + dcbt AO, PRE + KERNEL8x16_SUB1 + KERNEL8x16_SUB1 + + KERNEL8x16_SUB1 + KERNEL8x16_SUB1 + KERNEL8x16_SUB1 + KERNEL8x16_SUB1 + + b .LSGEMM_L8x16_SUB1 + +.LSGEMM_L8x16_SUB0: + + andi. L, K, 7 + + KERNEL8x16_SUBI1 + + addic. L, L, -1 + ble .LSGEMM_L8x16_SAVE + b .LSGEMM_L8x16_SUB2 + +.LSGEMM_L8x16_SUB1: + + andi. L, K, 7 + ble .LSGEMM_L8x16_SAVE + +.LSGEMM_L8x16_SUB2: + + KERNEL8x16_SUB1 + + addic. L, L, -1 + bgt .LSGEMM_L8x16_SUB2 + +.LSGEMM_L8x16_SAVE: + + SAVE8x16 + + addic. I, I, -1 + bgt .LSGEMM_L8x16_BEGIN + +.LSGEMM_L8x16_END: + +.LSGEMM_L8x8_BEGIN: + + andi. T2, M, 15 + ble .LSGEMM_L8x1_END + + andi. T1, M, 8 + ble .LSGEMM_L8x8_END + mr BO, B + srawi. L, K, 3 + ble .LSGEMM_L8x8_SUB0 + cmpwi cr0, L, 1 + ble .LSGEMM_L8x8_SUB4 + +.LSGEMM_L8x8_LOOP_START: + + LOAD8x8_1 + KERNEL8x8_I1 + KERNEL8x8_2 + KERNEL8x8_1 + KERNEL8x8_2 + + KERNEL8x8_1 + KERNEL8x8_2 + KERNEL8x8_1 + KERNEL8x8_2 + + addic. L, L, -2 + ble .LSGEMM_L8x8_LOOP_END + + .align 5 + +.LSGEMM_L8x8_LOOP: + + KERNEL8x8_1 + KERNEL8x8_2 + KERNEL8x8_1 + KERNEL8x8_2 + + KERNEL8x8_1 + KERNEL8x8_2 + KERNEL8x8_1 + KERNEL8x8_2 + + addic. L, L, -1 + bgt .LSGEMM_L8x8_LOOP + +.LSGEMM_L8x8_LOOP_END: + + KERNEL8x8_1 + KERNEL8x8_2 + KERNEL8x8_1 + KERNEL8x8_2 + + KERNEL8x8_1 + KERNEL8x8_2 + KERNEL8x8_1 + KERNEL8x8_E2 + + b .LSGEMM_L8x8_SUB1 + +.LSGEMM_L8x8_SUB4: + + KERNEL8x8_SUBI1 + KERNEL8x8_SUB1 + KERNEL8x8_SUB1 + KERNEL8x8_SUB1 + + KERNEL8x8_SUB1 + KERNEL8x8_SUB1 + KERNEL8x8_SUB1 + KERNEL8x8_SUB1 + + b .LSGEMM_L8x8_SUB1 + +.LSGEMM_L8x8_SUB0: + + andi. L, K, 7 + + KERNEL8x8_SUBI1 + + addic. L, L, -1 + ble .LSGEMM_L8x8_SAVE + b .LSGEMM_L8x8_SUB2 + +.LSGEMM_L8x8_SUB1: + + andi. 
L, K, 7 + ble .LSGEMM_L8x8_SAVE + +.LSGEMM_L8x8_SUB2: + + KERNEL8x8_SUB1 + + addic. L, L, -1 + bgt .LSGEMM_L8x8_SUB2 + +.LSGEMM_L8x8_SAVE: + + SAVE8x8 + +.LSGEMM_L8x8_END: + +.LSGEMM_L8x4_BEGIN: + + + andi. T1, M, 4 + ble .LSGEMM_L8x4_END + mr BO, B + srawi. L, K, 3 + ble .LSGEMM_L8x4_SUB0 + cmpwi cr0, L, 1 + ble .LSGEMM_L8x4_SUB4 + +.LSGEMM_L8x4_LOOP_START: + + LOAD8x4_1 + KERNEL8x4_I1 + KERNEL8x4_2 + KERNEL8x4_1 + KERNEL8x4_2 + + KERNEL8x4_1 + KERNEL8x4_2 + KERNEL8x4_1 + KERNEL8x4_2 + + addic. L, L, -2 + ble .LSGEMM_L8x4_LOOP_END + + .align 5 + +.LSGEMM_L8x4_LOOP: + + KERNEL8x4_1 + KERNEL8x4_2 + KERNEL8x4_1 + KERNEL8x4_2 + + KERNEL8x4_1 + KERNEL8x4_2 + KERNEL8x4_1 + KERNEL8x4_2 + + addic. L, L, -1 + bgt .LSGEMM_L8x4_LOOP + +.LSGEMM_L8x4_LOOP_END: + + KERNEL8x4_1 + KERNEL8x4_2 + KERNEL8x4_1 + KERNEL8x4_2 + + KERNEL8x4_1 + KERNEL8x4_2 + KERNEL8x4_1 + KERNEL8x4_E2 + + b .LSGEMM_L8x4_SUB1 + +.LSGEMM_L8x4_SUB4: + + KERNEL8x4_SUBI1 + KERNEL8x4_SUB1 + KERNEL8x4_SUB1 + KERNEL8x4_SUB1 + + KERNEL8x4_SUB1 + KERNEL8x4_SUB1 + KERNEL8x4_SUB1 + KERNEL8x4_SUB1 + + b .LSGEMM_L8x4_SUB1 + +.LSGEMM_L8x4_SUB0: + + andi. L, K, 7 + + KERNEL8x4_SUBI1 + + addic. L, L, -1 + ble .LSGEMM_L8x4_SAVE + b .LSGEMM_L8x4_SUB2 + +.LSGEMM_L8x4_SUB1: + + andi. L, K, 7 + ble .LSGEMM_L8x4_SAVE + +.LSGEMM_L8x4_SUB2: + + KERNEL8x4_SUB1 + + addic. L, L, -1 + bgt .LSGEMM_L8x4_SUB2 + +.LSGEMM_L8x4_SAVE: + + SAVE8x4 + +.LSGEMM_L8x4_END: + +.LSGEMM_L8x2_BEGIN: + + + andi. T1, M, 2 + ble .LSGEMM_L8x2_END + mr BO, B + srawi. L, K, 3 + ble .LSGEMM_L8x2_SUB0 + cmpwi cr0, L, 1 + ble .LSGEMM_L8x2_SUB4 + +.LSGEMM_L8x2_LOOP_START: + + LOAD8x2_1 + KERNEL8x2_I1 + KERNEL8x2_2 + KERNEL8x2_1 + KERNEL8x2_2 + + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_1 + KERNEL8x2_2 + + addic. L, L, -2 + ble .LSGEMM_L8x2_LOOP_END + + .align 5 + +.LSGEMM_L8x2_LOOP: + + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_1 + KERNEL8x2_2 + + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_1 + KERNEL8x2_2 + + addic. 
L, L, -1 + bgt .LSGEMM_L8x2_LOOP + +.LSGEMM_L8x2_LOOP_END: + + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_1 + KERNEL8x2_2 + + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_1 + KERNEL8x2_E2 + + b .LSGEMM_L8x2_SUB1 + +.LSGEMM_L8x2_SUB4: + + KERNEL8x2_SUBI1 + KERNEL8x2_SUB1 + KERNEL8x2_SUB1 + KERNEL8x2_SUB1 + + KERNEL8x2_SUB1 + KERNEL8x2_SUB1 + KERNEL8x2_SUB1 + KERNEL8x2_SUB1 + + b .LSGEMM_L8x2_SUB1 + +.LSGEMM_L8x2_SUB0: + + andi. L, K, 7 + + KERNEL8x2_SUBI1 + + addic. L, L, -1 + ble .LSGEMM_L8x2_SAVE + b .LSGEMM_L8x2_SUB2 + +.LSGEMM_L8x2_SUB1: + + andi. L, K, 7 + ble .LSGEMM_L8x2_SAVE + +.LSGEMM_L8x2_SUB2: + + KERNEL8x2_SUB1 + + addic. L, L, -1 + bgt .LSGEMM_L8x2_SUB2 + +.LSGEMM_L8x2_SAVE: + + SAVE8x2 + +.LSGEMM_L8x2_END: + +.LSGEMM_L8x1_BEGIN: + + + andi. T1, M, 1 + ble .LSGEMM_L8x1_END + mr BO, B + srawi. L, K, 3 + ble .LSGEMM_L8x1_SUB0 + cmpwi cr0, L, 1 + ble .LSGEMM_L8x1_SUB4 + +.LSGEMM_L8x1_LOOP_START: + + LOAD8x1_1 + KERNEL8x1_I1 + KERNEL8x1_2 + KERNEL8x1_1 + KERNEL8x1_2 + + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_1 + KERNEL8x1_2 + + addic. L, L, -2 + ble .LSGEMM_L8x1_LOOP_END + + .align 5 + +.LSGEMM_L8x1_LOOP: + + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_1 + KERNEL8x1_2 + + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_1 + KERNEL8x1_2 + + addic. L, L, -1 + bgt .LSGEMM_L8x1_LOOP + +.LSGEMM_L8x1_LOOP_END: + + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_1 + KERNEL8x1_2 + + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_1 + KERNEL8x1_E2 + + b .LSGEMM_L8x1_SUB1 + +.LSGEMM_L8x1_SUB4: + + KERNEL8x1_SUBI1 + KERNEL8x1_SUB1 + KERNEL8x1_SUB1 + KERNEL8x1_SUB1 + + KERNEL8x1_SUB1 + KERNEL8x1_SUB1 + KERNEL8x1_SUB1 + KERNEL8x1_SUB1 + + b .LSGEMM_L8x1_SUB1 + +.LSGEMM_L8x1_SUB0: + + andi. L, K, 7 + + KERNEL8x1_SUBI1 + + addic. L, L, -1 + ble .LSGEMM_L8x1_SAVE + b .LSGEMM_L8x1_SUB2 + +.LSGEMM_L8x1_SUB1: + + andi. L, K, 7 + ble .LSGEMM_L8x1_SAVE + +.LSGEMM_L8x1_SUB2: + + KERNEL8x1_SUB1 + + addic. 
L, L, -1 + bgt .LSGEMM_L8x1_SUB2 + +.LSGEMM_L8x1_SAVE: + + SAVE8x1 + +.LSGEMM_L8x1_END: + + slwi T1, K, 5 + add B, B, T1 + + addic. J, J, -1 + bgt .LSGEMM_L8_BEGIN + + andi. T2, N, 7 + ble .L999 + +.LSGEMM_L8_END: + + b .LSGEMM_L4_BEGIN + +.L999_H1: + + b .L999 + +.LSGEMM_L4_BEGIN: + + andi. T1, N, 4 + ble .LSGEMM_L4_END + mr CO, C + mr AO, A + slwi T1, LDC , 2 + add C, C, T1 + srawi. I, M, 4 + ble .LSGEMM_L4x16_END + +.LSGEMM_L4x16_BEGIN: + + + mr BO, B + srawi. L, K, 3 + ble .LSGEMM_L4x16_SUB0 + cmpwi cr0, L, 1 + ble .LSGEMM_L4x16_SUB4 + +.LSGEMM_L4x16_LOOP_START: + + dcbt AO, PRE + LOAD4x16_1 + KERNEL4x16_I1 + dcbt AO, PRE + KERNEL4x16_2 + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + + addic. L, L, -2 + ble .LSGEMM_L4x16_LOOP_END + + .align 5 + +.LSGEMM_L4x16_LOOP: + + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + + addic. L, L, -1 + bgt .LSGEMM_L4x16_LOOP + +.LSGEMM_L4x16_LOOP_END: + + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + KERNEL4x16_1 + KERNEL4x16_E2 + + b .LSGEMM_L4x16_SUB1 + +.LSGEMM_L4x16_SUB4: + + dcbt AO, PRE + KERNEL4x16_SUBI1 + KERNEL4x16_SUB1 + dcbt AO, PRE + KERNEL4x16_SUB1 + KERNEL4x16_SUB1 + + KERNEL4x16_SUB1 + KERNEL4x16_SUB1 + KERNEL4x16_SUB1 + KERNEL4x16_SUB1 + + b .LSGEMM_L4x16_SUB1 + +.LSGEMM_L4x16_SUB0: + + andi. L, K, 7 + + KERNEL4x16_SUBI1 + + addic. L, L, -1 + ble .LSGEMM_L4x16_SAVE + b .LSGEMM_L4x16_SUB2 + +.LSGEMM_L4x16_SUB1: + + andi. L, K, 7 + ble .LSGEMM_L4x16_SAVE + +.LSGEMM_L4x16_SUB2: + + KERNEL4x16_SUB1 + + addic. L, L, -1 + bgt .LSGEMM_L4x16_SUB2 + +.LSGEMM_L4x16_SAVE: + + SAVE4x16 + + addic. I, I, -1 + bgt .LSGEMM_L4x16_BEGIN + +.LSGEMM_L4x16_END: + +.LSGEMM_L4x8_BEGIN: + + andi. 
T2, M, 15 + ble .LSGEMM_L4x1_END + + andi. T1, M, 8 + ble .LSGEMM_L4x8_END + mr BO, B + srawi. L, K, 3 + ble .LSGEMM_L4x8_SUB0 + cmpwi cr0, L, 1 + ble .LSGEMM_L4x8_SUB4 + +.LSGEMM_L4x8_LOOP_START: + + LOAD4x8_1 + KERNEL4x8_I1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_2 + + KERNEL4x8_1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_2 + + addic. L, L, -2 + ble .LSGEMM_L4x8_LOOP_END + + .align 5 + +.LSGEMM_L4x8_LOOP: + + KERNEL4x8_1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_2 + + KERNEL4x8_1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_2 + + addic. L, L, -1 + bgt .LSGEMM_L4x8_LOOP + +.LSGEMM_L4x8_LOOP_END: + + KERNEL4x8_1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_2 + + KERNEL4x8_1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_E2 + + b .LSGEMM_L4x8_SUB1 + +.LSGEMM_L4x8_SUB4: + + KERNEL4x8_SUBI1 + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + + b .LSGEMM_L4x8_SUB1 + +.LSGEMM_L4x8_SUB0: + + andi. L, K, 7 + + KERNEL4x8_SUBI1 + + addic. L, L, -1 + ble .LSGEMM_L4x8_SAVE + b .LSGEMM_L4x8_SUB2 + +.LSGEMM_L4x8_SUB1: + + andi. L, K, 7 + ble .LSGEMM_L4x8_SAVE + +.LSGEMM_L4x8_SUB2: + + KERNEL4x8_SUB1 + + addic. L, L, -1 + bgt .LSGEMM_L4x8_SUB2 + +.LSGEMM_L4x8_SAVE: + + SAVE4x8 + +.LSGEMM_L4x8_END: + +.LSGEMM_L4x4_BEGIN: + + + andi. T1, M, 4 + ble .LSGEMM_L4x4_END + mr BO, B + srawi. L, K, 3 + ble .LSGEMM_L4x4_SUB0 + cmpwi cr0, L, 1 + ble .LSGEMM_L4x4_SUB4 + +.LSGEMM_L4x4_LOOP_START: + + LOAD4x4_1 + KERNEL4x4_I1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + addic. L, L, -2 + ble .LSGEMM_L4x4_LOOP_END + + .align 5 + +.LSGEMM_L4x4_LOOP: + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + addic. 
L, L, -1 + bgt .LSGEMM_L4x4_LOOP + +.LSGEMM_L4x4_LOOP_END: + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_E2 + + b .LSGEMM_L4x4_SUB1 + +.LSGEMM_L4x4_SUB4: + + KERNEL4x4_SUBI1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + + b .LSGEMM_L4x4_SUB1 + +.LSGEMM_L4x4_SUB0: + + andi. L, K, 7 + + KERNEL4x4_SUBI1 + + addic. L, L, -1 + ble .LSGEMM_L4x4_SAVE + b .LSGEMM_L4x4_SUB2 + +.LSGEMM_L4x4_SUB1: + + andi. L, K, 7 + ble .LSGEMM_L4x4_SAVE + +.LSGEMM_L4x4_SUB2: + + KERNEL4x4_SUB1 + + addic. L, L, -1 + bgt .LSGEMM_L4x4_SUB2 + +.LSGEMM_L4x4_SAVE: + + SAVE4x4 + +.LSGEMM_L4x4_END: + +.LSGEMM_L4x2_BEGIN: + + + andi. T1, M, 2 + ble .LSGEMM_L4x2_END + mr BO, B + srawi. L, K, 3 + ble .LSGEMM_L4x2_SUB0 + cmpwi cr0, L, 1 + ble .LSGEMM_L4x2_SUB4 + +.LSGEMM_L4x2_LOOP_START: + + LOAD4x2_1 + KERNEL4x2_I1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + addic. L, L, -2 + ble .LSGEMM_L4x2_LOOP_END + + .align 5 + +.LSGEMM_L4x2_LOOP: + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + addic. L, L, -1 + bgt .LSGEMM_L4x2_LOOP + +.LSGEMM_L4x2_LOOP_END: + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_E2 + + b .LSGEMM_L4x2_SUB1 + +.LSGEMM_L4x2_SUB4: + + KERNEL4x2_SUBI1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + + b .LSGEMM_L4x2_SUB1 + +.LSGEMM_L4x2_SUB0: + + andi. L, K, 7 + + KERNEL4x2_SUBI1 + + addic. L, L, -1 + ble .LSGEMM_L4x2_SAVE + b .LSGEMM_L4x2_SUB2 + +.LSGEMM_L4x2_SUB1: + + andi. L, K, 7 + ble .LSGEMM_L4x2_SAVE + +.LSGEMM_L4x2_SUB2: + + KERNEL4x2_SUB1 + + addic. L, L, -1 + bgt .LSGEMM_L4x2_SUB2 + +.LSGEMM_L4x2_SAVE: + + SAVE4x2 + +.LSGEMM_L4x2_END: + +.LSGEMM_L4x1_BEGIN: + + + andi. 
T1, M, 1 + ble .LSGEMM_L4x1_END + mr BO, B + srawi. L, K, 3 + ble .LSGEMM_L4x1_SUB0 + cmpwi cr0, L, 1 + ble .LSGEMM_L4x1_SUB4 + +.LSGEMM_L4x1_LOOP_START: + + LOAD4x1_1 + KERNEL4x1_I1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + addic. L, L, -2 + ble .LSGEMM_L4x1_LOOP_END + + .align 5 + +.LSGEMM_L4x1_LOOP: + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + addic. L, L, -1 + bgt .LSGEMM_L4x1_LOOP + +.LSGEMM_L4x1_LOOP_END: + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_E2 + + b .LSGEMM_L4x1_SUB1 + +.LSGEMM_L4x1_SUB4: + + KERNEL4x1_SUBI1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + + b .LSGEMM_L4x1_SUB1 + +.LSGEMM_L4x1_SUB0: + + andi. L, K, 7 + + KERNEL4x1_SUBI1 + + addic. L, L, -1 + ble .LSGEMM_L4x1_SAVE + b .LSGEMM_L4x1_SUB2 + +.LSGEMM_L4x1_SUB1: + + andi. L, K, 7 + ble .LSGEMM_L4x1_SAVE + +.LSGEMM_L4x1_SUB2: + + KERNEL4x1_SUB1 + + addic. L, L, -1 + bgt .LSGEMM_L4x1_SUB2 + +.LSGEMM_L4x1_SAVE: + + SAVE4x1 + +.LSGEMM_L4x1_END: + + slwi T1, K, 4 + add B, B, T1 + +.LSGEMM_L4_END: +.LSGEMM_L2_BEGIN: + + andi. T1, N, 2 + ble .LSGEMM_L2_END + mr CO, C + mr AO, A + slwi T1, LDC , 1 + add C, C, T1 + srawi. I, M, 4 + ble .LSGEMM_L2x16_END + +.LSGEMM_L2x16_BEGIN: + + + mr BO, B + srawi. L, K, 3 + ble .LSGEMM_L2x16_SUB0 + cmpwi cr0, L, 1 + ble .LSGEMM_L2x16_SUB4 + +.LSGEMM_L2x16_LOOP_START: + + dcbt AO, PRE + LOAD2x16_1 + KERNEL2x16_I1 + dcbt AO, PRE + KERNEL2x16_2 + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + + addic. 
L, L, -2 + ble .LSGEMM_L2x16_LOOP_END + + .align 5 + +.LSGEMM_L2x16_LOOP: + + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + + addic. L, L, -1 + bgt .LSGEMM_L2x16_LOOP + +.LSGEMM_L2x16_LOOP_END: + + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + KERNEL2x16_1 + KERNEL2x16_E2 + + b .LSGEMM_L2x16_SUB1 + +.LSGEMM_L2x16_SUB4: + + dcbt AO, PRE + KERNEL2x16_SUBI1 + KERNEL2x16_SUB1 + dcbt AO, PRE + KERNEL2x16_SUB1 + KERNEL2x16_SUB1 + + KERNEL2x16_SUB1 + KERNEL2x16_SUB1 + KERNEL2x16_SUB1 + KERNEL2x16_SUB1 + + b .LSGEMM_L2x16_SUB1 + +.LSGEMM_L2x16_SUB0: + + andi. L, K, 7 + + KERNEL2x16_SUBI1 + + addic. L, L, -1 + ble .LSGEMM_L2x16_SAVE + b .LSGEMM_L2x16_SUB2 + +.LSGEMM_L2x16_SUB1: + + andi. L, K, 7 + ble .LSGEMM_L2x16_SAVE + +.LSGEMM_L2x16_SUB2: + + KERNEL2x16_SUB1 + + addic. L, L, -1 + bgt .LSGEMM_L2x16_SUB2 + +.LSGEMM_L2x16_SAVE: + + SAVE2x16 + + addic. I, I, -1 + bgt .LSGEMM_L2x16_BEGIN + +.LSGEMM_L2x16_END: + +.LSGEMM_L2x8_BEGIN: + + andi. T2, M, 15 + ble .LSGEMM_L2x1_END + + andi. T1, M, 8 + ble .LSGEMM_L2x8_END + mr BO, B + srawi. L, K, 3 + ble .LSGEMM_L2x8_SUB0 + cmpwi cr0, L, 1 + ble .LSGEMM_L2x8_SUB4 + +.LSGEMM_L2x8_LOOP_START: + + LOAD2x8_1 + KERNEL2x8_I1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + addic. L, L, -2 + ble .LSGEMM_L2x8_LOOP_END + + .align 5 + +.LSGEMM_L2x8_LOOP: + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + addic. 
L, L, -1 + bgt .LSGEMM_L2x8_LOOP + +.LSGEMM_L2x8_LOOP_END: + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_E2 + + b .LSGEMM_L2x8_SUB1 + +.LSGEMM_L2x8_SUB4: + + KERNEL2x8_SUBI1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + + b .LSGEMM_L2x8_SUB1 + +.LSGEMM_L2x8_SUB0: + + andi. L, K, 7 + + KERNEL2x8_SUBI1 + + addic. L, L, -1 + ble .LSGEMM_L2x8_SAVE + b .LSGEMM_L2x8_SUB2 + +.LSGEMM_L2x8_SUB1: + + andi. L, K, 7 + ble .LSGEMM_L2x8_SAVE + +.LSGEMM_L2x8_SUB2: + + KERNEL2x8_SUB1 + + addic. L, L, -1 + bgt .LSGEMM_L2x8_SUB2 + +.LSGEMM_L2x8_SAVE: + + SAVE2x8 + +.LSGEMM_L2x8_END: + +.LSGEMM_L2x4_BEGIN: + + + andi. T1, M, 4 + ble .LSGEMM_L2x4_END + mr BO, B + srawi. L, K, 3 + ble .LSGEMM_L2x4_SUB0 + cmpwi cr0, L, 1 + ble .LSGEMM_L2x4_SUB4 + +.LSGEMM_L2x4_LOOP_START: + + LOAD2x4_1 + KERNEL2x4_I1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + addic. L, L, -2 + ble .LSGEMM_L2x4_LOOP_END + + .align 5 + +.LSGEMM_L2x4_LOOP: + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + addic. L, L, -1 + bgt .LSGEMM_L2x4_LOOP + +.LSGEMM_L2x4_LOOP_END: + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_E2 + + b .LSGEMM_L2x4_SUB1 + +.LSGEMM_L2x4_SUB4: + + KERNEL2x4_SUBI1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + + b .LSGEMM_L2x4_SUB1 + +.LSGEMM_L2x4_SUB0: + + andi. L, K, 7 + + KERNEL2x4_SUBI1 + + addic. L, L, -1 + ble .LSGEMM_L2x4_SAVE + b .LSGEMM_L2x4_SUB2 + +.LSGEMM_L2x4_SUB1: + + andi. L, K, 7 + ble .LSGEMM_L2x4_SAVE + +.LSGEMM_L2x4_SUB2: + + KERNEL2x4_SUB1 + + addic. L, L, -1 + bgt .LSGEMM_L2x4_SUB2 + +.LSGEMM_L2x4_SAVE: + + SAVE2x4 + +.LSGEMM_L2x4_END: + +.LSGEMM_L2x2_BEGIN: + + + andi. 
T1, M, 2 + ble .LSGEMM_L2x2_END + mr BO, B + srawi. L, K, 3 + ble .LSGEMM_L2x2_SUB0 + cmpwi cr0, L, 1 + ble .LSGEMM_L2x2_SUB4 + +.LSGEMM_L2x2_LOOP_START: + + LOAD2x2_1 + KERNEL2x2_I1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + addic. L, L, -2 + ble .LSGEMM_L2x2_LOOP_END + + .align 5 + +.LSGEMM_L2x2_LOOP: + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + addic. L, L, -1 + bgt .LSGEMM_L2x2_LOOP + +.LSGEMM_L2x2_LOOP_END: + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_E2 + + b .LSGEMM_L2x2_SUB1 + +.LSGEMM_L2x2_SUB4: + + KERNEL2x2_SUBI1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + + b .LSGEMM_L2x2_SUB1 + +.LSGEMM_L2x2_SUB0: + + andi. L, K, 7 + + KERNEL2x2_SUBI1 + + addic. L, L, -1 + ble .LSGEMM_L2x2_SAVE + b .LSGEMM_L2x2_SUB2 + +.LSGEMM_L2x2_SUB1: + + andi. L, K, 7 + ble .LSGEMM_L2x2_SAVE + +.LSGEMM_L2x2_SUB2: + + KERNEL2x2_SUB1 + + addic. L, L, -1 + bgt .LSGEMM_L2x2_SUB2 + +.LSGEMM_L2x2_SAVE: + + SAVE2x2 + +.LSGEMM_L2x2_END: + +.LSGEMM_L2x1_BEGIN: + + + andi. T1, M, 1 + ble .LSGEMM_L2x1_END + mr BO, B + srawi. L, K, 3 + ble .LSGEMM_L2x1_SUB0 + cmpwi cr0, L, 1 + ble .LSGEMM_L2x1_SUB4 + +.LSGEMM_L2x1_LOOP_START: + + LOAD2x1_1 + KERNEL2x1_I1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + addic. L, L, -2 + ble .LSGEMM_L2x1_LOOP_END + + .align 5 + +.LSGEMM_L2x1_LOOP: + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + addic. 
L, L, -1 + bgt .LSGEMM_L2x1_LOOP + +.LSGEMM_L2x1_LOOP_END: + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_E2 + + b .LSGEMM_L2x1_SUB1 + +.LSGEMM_L2x1_SUB4: + + KERNEL2x1_SUBI1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + + b .LSGEMM_L2x1_SUB1 + +.LSGEMM_L2x1_SUB0: + + andi. L, K, 7 + + KERNEL2x1_SUBI1 + + addic. L, L, -1 + ble .LSGEMM_L2x1_SAVE + b .LSGEMM_L2x1_SUB2 + +.LSGEMM_L2x1_SUB1: + + andi. L, K, 7 + ble .LSGEMM_L2x1_SAVE + +.LSGEMM_L2x1_SUB2: + + KERNEL2x1_SUB1 + + addic. L, L, -1 + bgt .LSGEMM_L2x1_SUB2 + +.LSGEMM_L2x1_SAVE: + + SAVE2x1 + +.LSGEMM_L2x1_END: + + slwi T1, K, 3 + add B, B, T1 + +.LSGEMM_L2_END: +.LSGEMM_L1_BEGIN: + + andi. T1, N, 1 + ble .LSGEMM_L1_END + mr CO, C + mr AO, A + srawi. I, M, 4 + ble .LSGEMM_L1x16_END + +.LSGEMM_L1x16_BEGIN: + + + mr BO, B + srawi. L, K, 3 + ble .LSGEMM_L1x16_SUB0 + cmpwi cr0, L, 1 + ble .LSGEMM_L1x16_SUB4 + +.LSGEMM_L1x16_LOOP_START: + + dcbt AO, PRE + LOAD1x16_1 + KERNEL1x16_I1 + dcbt AO, PRE + KERNEL1x16_2 + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + + addic. L, L, -2 + ble .LSGEMM_L1x16_LOOP_END + + .align 5 + +.LSGEMM_L1x16_LOOP: + + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + + addic. 
L, L, -1 + bgt .LSGEMM_L1x16_LOOP + +.LSGEMM_L1x16_LOOP_END: + + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + KERNEL1x16_1 + KERNEL1x16_E2 + + b .LSGEMM_L1x16_SUB1 + +.LSGEMM_L1x16_SUB4: + + dcbt AO, PRE + KERNEL1x16_SUBI1 + KERNEL1x16_SUB1 + dcbt AO, PRE + KERNEL1x16_SUB1 + KERNEL1x16_SUB1 + + KERNEL1x16_SUB1 + KERNEL1x16_SUB1 + KERNEL1x16_SUB1 + KERNEL1x16_SUB1 + + b .LSGEMM_L1x16_SUB1 + +.LSGEMM_L1x16_SUB0: + + andi. L, K, 7 + + KERNEL1x16_SUBI1 + + addic. L, L, -1 + ble .LSGEMM_L1x16_SAVE + b .LSGEMM_L1x16_SUB2 + +.LSGEMM_L1x16_SUB1: + + andi. L, K, 7 + ble .LSGEMM_L1x16_SAVE + +.LSGEMM_L1x16_SUB2: + + KERNEL1x16_SUB1 + + addic. L, L, -1 + bgt .LSGEMM_L1x16_SUB2 + +.LSGEMM_L1x16_SAVE: + + SAVE1x16 + + addic. I, I, -1 + bgt .LSGEMM_L1x16_BEGIN + +.LSGEMM_L1x16_END: + +.LSGEMM_L1x8_BEGIN: + + andi. T2, M, 15 + ble .LSGEMM_L1x1_END + + andi. T1, M, 8 + ble .LSGEMM_L1x8_END + mr BO, B + srawi. L, K, 3 + ble .LSGEMM_L1x8_SUB0 + cmpwi cr0, L, 1 + ble .LSGEMM_L1x8_SUB4 + +.LSGEMM_L1x8_LOOP_START: + + LOAD1x8_1 + KERNEL1x8_I1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_2 + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_2 + + addic. L, L, -2 + ble .LSGEMM_L1x8_LOOP_END + + .align 5 + +.LSGEMM_L1x8_LOOP: + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_2 + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_2 + + addic. L, L, -1 + bgt .LSGEMM_L1x8_LOOP + +.LSGEMM_L1x8_LOOP_END: + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_2 + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_E2 + + b .LSGEMM_L1x8_SUB1 + +.LSGEMM_L1x8_SUB4: + + KERNEL1x8_SUBI1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + + b .LSGEMM_L1x8_SUB1 + +.LSGEMM_L1x8_SUB0: + + andi. L, K, 7 + + KERNEL1x8_SUBI1 + + addic. L, L, -1 + ble .LSGEMM_L1x8_SAVE + b .LSGEMM_L1x8_SUB2 + +.LSGEMM_L1x8_SUB1: + + andi. 
L, K, 7 + ble .LSGEMM_L1x8_SAVE + +.LSGEMM_L1x8_SUB2: + + KERNEL1x8_SUB1 + + addic. L, L, -1 + bgt .LSGEMM_L1x8_SUB2 + +.LSGEMM_L1x8_SAVE: + + SAVE1x8 + +.LSGEMM_L1x8_END: + +.LSGEMM_L1x4_BEGIN: + + + andi. T1, M, 4 + ble .LSGEMM_L1x4_END + mr BO, B + srawi. L, K, 3 + ble .LSGEMM_L1x4_SUB0 + cmpwi cr0, L, 1 + ble .LSGEMM_L1x4_SUB4 + +.LSGEMM_L1x4_LOOP_START: + + LOAD1x4_1 + KERNEL1x4_I1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + addic. L, L, -2 + ble .LSGEMM_L1x4_LOOP_END + + .align 5 + +.LSGEMM_L1x4_LOOP: + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + addic. L, L, -1 + bgt .LSGEMM_L1x4_LOOP + +.LSGEMM_L1x4_LOOP_END: + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_E2 + + b .LSGEMM_L1x4_SUB1 + +.LSGEMM_L1x4_SUB4: + + KERNEL1x4_SUBI1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + + b .LSGEMM_L1x4_SUB1 + +.LSGEMM_L1x4_SUB0: + + andi. L, K, 7 + + KERNEL1x4_SUBI1 + + addic. L, L, -1 + ble .LSGEMM_L1x4_SAVE + b .LSGEMM_L1x4_SUB2 + +.LSGEMM_L1x4_SUB1: + + andi. L, K, 7 + ble .LSGEMM_L1x4_SAVE + +.LSGEMM_L1x4_SUB2: + + KERNEL1x4_SUB1 + + addic. L, L, -1 + bgt .LSGEMM_L1x4_SUB2 + +.LSGEMM_L1x4_SAVE: + + SAVE1x4 + +.LSGEMM_L1x4_END: + +.LSGEMM_L1x2_BEGIN: + + + andi. T1, M, 2 + ble .LSGEMM_L1x2_END + mr BO, B + srawi. L, K, 3 + ble .LSGEMM_L1x2_SUB0 + cmpwi cr0, L, 1 + ble .LSGEMM_L1x2_SUB4 + +.LSGEMM_L1x2_LOOP_START: + + LOAD1x2_1 + KERNEL1x2_I1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + addic. L, L, -2 + ble .LSGEMM_L1x2_LOOP_END + + .align 5 + +.LSGEMM_L1x2_LOOP: + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + addic. 
L, L, -1 + bgt .LSGEMM_L1x2_LOOP + +.LSGEMM_L1x2_LOOP_END: + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_E2 + + b .LSGEMM_L1x2_SUB1 + +.LSGEMM_L1x2_SUB4: + + KERNEL1x2_SUBI1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + + b .LSGEMM_L1x2_SUB1 + +.LSGEMM_L1x2_SUB0: + + andi. L, K, 7 + + KERNEL1x2_SUBI1 + + addic. L, L, -1 + ble .LSGEMM_L1x2_SAVE + b .LSGEMM_L1x2_SUB2 + +.LSGEMM_L1x2_SUB1: + + andi. L, K, 7 + ble .LSGEMM_L1x2_SAVE + +.LSGEMM_L1x2_SUB2: + + KERNEL1x2_SUB1 + + addic. L, L, -1 + bgt .LSGEMM_L1x2_SUB2 + +.LSGEMM_L1x2_SAVE: + + SAVE1x2 + +.LSGEMM_L1x2_END: + +.LSGEMM_L1x1_BEGIN: + + + andi. T1, M, 1 + ble .LSGEMM_L1x1_END + mr BO, B + srawi. L, K, 3 + ble .LSGEMM_L1x1_SUB0 + cmpwi cr0, L, 1 + ble .LSGEMM_L1x1_SUB4 + +.LSGEMM_L1x1_LOOP_START: + + LOAD1x1_1 + KERNEL1x1_I1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + addic. L, L, -2 + ble .LSGEMM_L1x1_LOOP_END + + .align 5 + +.LSGEMM_L1x1_LOOP: + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + addic. L, L, -1 + bgt .LSGEMM_L1x1_LOOP + +.LSGEMM_L1x1_LOOP_END: + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_E2 + + b .LSGEMM_L1x1_SUB1 + +.LSGEMM_L1x1_SUB4: + + KERNEL1x1_SUBI1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + + b .LSGEMM_L1x1_SUB1 + +.LSGEMM_L1x1_SUB0: + + andi. L, K, 7 + + KERNEL1x1_SUBI1 + + addic. L, L, -1 + ble .LSGEMM_L1x1_SAVE + b .LSGEMM_L1x1_SUB2 + +.LSGEMM_L1x1_SUB1: + + andi. L, K, 7 + ble .LSGEMM_L1x1_SAVE + +.LSGEMM_L1x1_SUB2: + + KERNEL1x1_SUB1 + + addic. 
L, L, -1 + bgt .LSGEMM_L1x1_SUB2 + +.LSGEMM_L1x1_SAVE: + + SAVE1x1 + +.LSGEMM_L1x1_END: + +.LSGEMM_L1_END: diff --git a/kernel/power/sgemm_macros_16x8_power8.S b/kernel/power/sgemm_macros_16x8_power8.S new file mode 100644 index 000000000..78f530cfa --- /dev/null +++ b/kernel/power/sgemm_macros_16x8_power8.S @@ -0,0 +1,6145 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2016/03/14 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +**************************************************************************************/ + +/********************************************************************************************** +* Macros for N=8 and M=16 +**********************************************************************************************/ + +.macro LOAD8x16_1 + + lxvw4x vs28, o0, BO + lxvw4x vs29, o16, BO + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + xxspltw vs12, vs29, 0 + xxspltw vs13, vs29, 1 + addi AO, AO, 64 + addi BO, BO, 32 + xxspltw vs14, vs29, 2 + xxspltw vs15, vs29, 3 + + +.endm + +.macro KERNEL8x16_I1 + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + + xvmulsp vs34, vs2, vs8 + xvmulsp vs35, vs3, vs8 + + lxvw4x vs28, o0, BO + lxvw4x vs29, o16, BO + + xvmulsp vs36, vs0, vs9 + xvmulsp vs37, vs1, vs9 + + lxvw4x vs6, o32, AO + lxvw4x vs7, o48, AO + + xvmulsp vs38, vs2, vs9 + xvmulsp vs39, vs3, vs9 + + xvmulsp vs40, vs0, vs10 + xvmulsp vs41, vs1, vs10 + xvmulsp vs42, vs2, vs10 + xvmulsp vs43, vs3, vs10 + + xvmulsp vs44, vs0, vs11 + xvmulsp vs45, vs1, vs11 + xvmulsp vs46, vs2, vs11 + xvmulsp vs47, vs3, vs11 + + xvmulsp vs48, vs0, vs12 + xvmulsp vs49, vs1, vs12 + xvmulsp vs50, vs2, vs12 + xvmulsp vs51, vs3, vs12 + + xvmulsp vs52, vs0, vs13 + xvmulsp vs53, vs1, vs13 + xvmulsp vs54, vs2, vs13 + xvmulsp vs55, vs3, vs13 + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + xxspltw vs18, vs28, 2 + xxspltw vs19, vs28, 3 + + xvmulsp vs56, vs0, vs14 + xvmulsp vs57, vs1, vs14 + xvmulsp vs58, vs2, vs14 + xvmulsp vs59, vs3, vs14 + + xxspltw vs20, vs29, 
0 + xxspltw vs21, vs29, 1 + xxspltw vs22, vs29, 2 + xxspltw vs23, vs29, 3 + + xvmulsp vs60, vs0, vs15 + xvmulsp vs61, vs1, vs15 + + addi AO, AO, 64 + addi BO, BO, 32 + + xvmulsp vs62, vs2, vs15 + xvmulsp vs63, vs3, vs15 + + +.endm + +.macro KERNEL8x16_1 + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + + xvmaddasp vs34, vs2, vs8 + xvmaddasp vs35, vs3, vs8 + + xvmaddasp vs36, vs0, vs9 + xvmaddasp vs37, vs1, vs9 + lxvw4x vs28, o0, BO + xvmaddasp vs38, vs2, vs9 + xvmaddasp vs39, vs3, vs9 + + xvmaddasp vs40, vs0, vs10 + xvmaddasp vs41, vs1, vs10 + + lxvw4x vs6, o32, AO + lxvw4x vs7, o48, AO + + xvmaddasp vs42, vs2, vs10 + xvmaddasp vs43, vs3, vs10 + + xvmaddasp vs44, vs0, vs11 + xvmaddasp vs45, vs1, vs11 + + lxvw4x vs29, o16, BO + + xvmaddasp vs46, vs2, vs11 + xvmaddasp vs47, vs3, vs11 + + xvmaddasp vs48, vs0, vs12 + xvmaddasp vs49, vs1, vs12 + xvmaddasp vs50, vs2, vs12 + xvmaddasp vs51, vs3, vs12 + + xvmaddasp vs52, vs0, vs13 + xvmaddasp vs53, vs1, vs13 + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + + xvmaddasp vs54, vs2, vs13 + xvmaddasp vs55, vs3, vs13 + + xvmaddasp vs56, vs0, vs14 + xvmaddasp vs57, vs1, vs14 + + xxspltw vs18, vs28, 2 + xxspltw vs19, vs28, 3 + + xvmaddasp vs58, vs2, vs14 + xvmaddasp vs59, vs3, vs14 + + xxspltw vs20, vs29, 0 + xxspltw vs21, vs29, 1 + + xvmaddasp vs60, vs0, vs15 + xvmaddasp vs61, vs1, vs15 + + addi AO, AO, 64 + addi BO, BO, 32 + + xvmaddasp vs62, vs2, vs15 + xvmaddasp vs63, vs3, vs15 + + xxspltw vs22, vs29, 2 + xxspltw vs23, vs29, 3 + +.endm + +.macro KERNEL8x16_2 + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + xvmaddasp vs34, vs6, vs16 + xvmaddasp vs35, vs7, vs16 + + xvmaddasp vs36, vs4, vs17 + xvmaddasp vs37, vs5, vs17 + + lxvw4x vs28, o0, BO + + xvmaddasp vs38, vs6, vs17 + xvmaddasp vs39, vs7, vs17 + + xvmaddasp vs40, vs4, vs18 + xvmaddasp vs41, vs5, vs18 + + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + xvmaddasp 
vs42, vs6, vs18 + xvmaddasp vs43, vs7, vs18 + + xvmaddasp vs44, vs4, vs19 + xvmaddasp vs45, vs5, vs19 + + lxvw4x vs29, o16, BO + + xvmaddasp vs46, vs6, vs19 + xvmaddasp vs47, vs7, vs19 + + xvmaddasp vs48, vs4, vs20 + xvmaddasp vs49, vs5, vs20 + xvmaddasp vs50, vs6, vs20 + xvmaddasp vs51, vs7, vs20 + + xvmaddasp vs52, vs4, vs21 + xvmaddasp vs53, vs5, vs21 + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + xvmaddasp vs54, vs6, vs21 + xvmaddasp vs55, vs7, vs21 + + xvmaddasp vs56, vs4, vs22 + xvmaddasp vs57, vs5, vs22 + + xxspltw vs12, vs29, 0 + xxspltw vs13, vs29, 1 + xxspltw vs14, vs29, 2 + xxspltw vs15, vs29, 3 + + xvmaddasp vs58, vs6, vs22 + xvmaddasp vs59, vs7, vs22 + + xvmaddasp vs60, vs4, vs23 + xvmaddasp vs61, vs5, vs23 + + addi AO, AO, 64 + addi BO, BO, 32 + + xvmaddasp vs62, vs6, vs23 + xvmaddasp vs63, vs7, vs23 + + +.endm + +.macro KERNEL8x16_E2 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + xvmaddasp vs34, vs6, vs16 + xvmaddasp vs35, vs7, vs16 + + xvmaddasp vs36, vs4, vs17 + xvmaddasp vs37, vs5, vs17 + xvmaddasp vs38, vs6, vs17 + xvmaddasp vs39, vs7, vs17 + + xvmaddasp vs40, vs4, vs18 + xvmaddasp vs41, vs5, vs18 + xvmaddasp vs42, vs6, vs18 + xvmaddasp vs43, vs7, vs18 + + xvmaddasp vs44, vs4, vs19 + xvmaddasp vs45, vs5, vs19 + xvmaddasp vs46, vs6, vs19 + xvmaddasp vs47, vs7, vs19 + + xvmaddasp vs48, vs4, vs20 + xvmaddasp vs49, vs5, vs20 + xvmaddasp vs50, vs6, vs20 + xvmaddasp vs51, vs7, vs20 + + xvmaddasp vs52, vs4, vs21 + xvmaddasp vs53, vs5, vs21 + xvmaddasp vs54, vs6, vs21 + xvmaddasp vs55, vs7, vs21 + + xvmaddasp vs56, vs4, vs22 + xvmaddasp vs57, vs5, vs22 + xvmaddasp vs58, vs6, vs22 + xvmaddasp vs59, vs7, vs22 + + xvmaddasp vs60, vs4, vs23 + xvmaddasp vs61, vs5, vs23 + xvmaddasp vs62, vs6, vs23 + xvmaddasp vs63, vs7, vs23 + + +.endm + +.macro KERNEL8x16_SUBI1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, 
BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs12, vs29, 0 + xxspltw vs13, vs29, 1 + xxspltw vs14, vs29, 2 + xxspltw vs15, vs29, 3 + + addi BO, BO, 32 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + xvmulsp vs34, vs2, vs8 + xvmulsp vs35, vs3, vs8 + + xvmulsp vs36, vs0, vs9 + xvmulsp vs37, vs1, vs9 + xvmulsp vs38, vs2, vs9 + xvmulsp vs39, vs3, vs9 + + xvmulsp vs40, vs0, vs10 + xvmulsp vs41, vs1, vs10 + xvmulsp vs42, vs2, vs10 + xvmulsp vs43, vs3, vs10 + + xvmulsp vs44, vs0, vs11 + xvmulsp vs45, vs1, vs11 + xvmulsp vs46, vs2, vs11 + xvmulsp vs47, vs3, vs11 + + xvmulsp vs48, vs0, vs12 + xvmulsp vs49, vs1, vs12 + xvmulsp vs50, vs2, vs12 + xvmulsp vs51, vs3, vs12 + + xvmulsp vs52, vs0, vs13 + xvmulsp vs53, vs1, vs13 + xvmulsp vs54, vs2, vs13 + xvmulsp vs55, vs3, vs13 + + xvmulsp vs56, vs0, vs14 + xvmulsp vs57, vs1, vs14 + xvmulsp vs58, vs2, vs14 + xvmulsp vs59, vs3, vs14 + + xvmulsp vs60, vs0, vs15 + xvmulsp vs61, vs1, vs15 + xvmulsp vs62, vs2, vs15 + xvmulsp vs63, vs3, vs15 + + +.endm + +.macro KERNEL8x16_SUB1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs12, vs29, 0 + xxspltw vs13, vs29, 1 + xxspltw vs14, vs29, 2 + xxspltw vs15, vs29, 3 + + addi BO, BO, 32 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + xvmaddasp vs34, vs2, vs8 + xvmaddasp vs35, vs3, vs8 + + xvmaddasp vs36, vs0, vs9 + xvmaddasp vs37, vs1, vs9 + xvmaddasp vs38, vs2, vs9 + xvmaddasp vs39, vs3, vs9 + + xvmaddasp vs40, vs0, vs10 + xvmaddasp vs41, vs1, vs10 + xvmaddasp vs42, vs2, vs10 + xvmaddasp vs43, vs3, vs10 + + xvmaddasp vs44, vs0, vs11 + xvmaddasp vs45, vs1, vs11 + xvmaddasp vs46, vs2, vs11 + xvmaddasp vs47, vs3, vs11 + + xvmaddasp vs48, vs0, vs12 + xvmaddasp vs49, vs1, 
vs12 + xvmaddasp vs50, vs2, vs12 + xvmaddasp vs51, vs3, vs12 + + xvmaddasp vs52, vs0, vs13 + xvmaddasp vs53, vs1, vs13 + xvmaddasp vs54, vs2, vs13 + xvmaddasp vs55, vs3, vs13 + + xvmaddasp vs56, vs0, vs14 + xvmaddasp vs57, vs1, vs14 + xvmaddasp vs58, vs2, vs14 + xvmaddasp vs59, vs3, vs14 + + xvmaddasp vs60, vs0, vs15 + xvmaddasp vs61, vs1, vs15 + xvmaddasp vs62, vs2, vs15 + xvmaddasp vs63, vs3, vs15 + + +.endm + +.macro SAVE8x16 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs32, alpha_vr + xvmulsp vs1, vs33, alpha_vr + xvmulsp vs2, vs34, alpha_vr + xvmulsp vs3, vs35, alpha_vr + +#else + + xvmaddasp vs0, vs32, alpha_vr + xvmaddasp vs1, vs33, alpha_vr + xvmaddasp vs2, vs34, alpha_vr + xvmaddasp vs3, vs35, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs36, alpha_vr + xvmulsp vs1, vs37, alpha_vr + xvmulsp vs2, vs38, alpha_vr + xvmulsp vs3, vs39, alpha_vr + +#else + + xvmaddasp vs0, vs36, alpha_vr + xvmaddasp vs1, vs37, alpha_vr + xvmaddasp vs2, vs38, alpha_vr + xvmaddasp vs3, vs39, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs40, alpha_vr + xvmulsp vs1, vs41, alpha_vr + xvmulsp vs2, vs42, alpha_vr + xvmulsp vs3, vs43, alpha_vr + +#else + + xvmaddasp vs0, vs40, alpha_vr + xvmaddasp vs1, vs41, alpha_vr + xvmaddasp vs2, vs42, alpha_vr + xvmaddasp vs3, vs43, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x 
vs3, o48, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs44, alpha_vr + xvmulsp vs1, vs45, alpha_vr + xvmulsp vs2, vs46, alpha_vr + xvmulsp vs3, vs47, alpha_vr + +#else + + xvmaddasp vs0, vs44, alpha_vr + xvmaddasp vs1, vs45, alpha_vr + xvmaddasp vs2, vs46, alpha_vr + xvmaddasp vs3, vs47, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs48, alpha_vr + xvmulsp vs1, vs49, alpha_vr + xvmulsp vs2, vs50, alpha_vr + xvmulsp vs3, vs51, alpha_vr + +#else + + xvmaddasp vs0, vs48, alpha_vr + xvmaddasp vs1, vs49, alpha_vr + xvmaddasp vs2, vs50, alpha_vr + xvmaddasp vs3, vs51, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs52, alpha_vr + xvmulsp vs1, vs53, alpha_vr + xvmulsp vs2, vs54, alpha_vr + xvmulsp vs3, vs55, alpha_vr + +#else + + xvmaddasp vs0, vs52, alpha_vr + xvmaddasp vs1, vs53, alpha_vr + xvmaddasp vs2, vs54, alpha_vr + xvmaddasp vs3, vs55, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs56, alpha_vr + xvmulsp vs1, vs57, alpha_vr + xvmulsp vs2, vs58, alpha_vr + xvmulsp vs3, vs59, alpha_vr + +#else + + xvmaddasp vs0, vs56, alpha_vr + xvmaddasp vs1, vs57, alpha_vr + xvmaddasp vs2, vs58, alpha_vr + 
xvmaddasp vs3, vs59, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs60, alpha_vr + xvmulsp vs1, vs61, alpha_vr + xvmulsp vs2, vs62, alpha_vr + xvmulsp vs3, vs63, alpha_vr + +#else + + xvmaddasp vs0, vs60, alpha_vr + xvmaddasp vs1, vs61, alpha_vr + xvmaddasp vs2, vs62, alpha_vr + xvmaddasp vs3, vs63, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + addi CO, CO, 64 + +.endm + + +/********************************************************************************************** +* Macros for N=8 and M=8 +**********************************************************************************************/ + +.macro LOAD8x8_1 + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs12, vs29, 0 + xxspltw vs13, vs29, 1 + xxspltw vs14, vs29, 2 + xxspltw vs15, vs29, 3 + + addi BO, BO, 32 + +.endm + +.macro KERNEL8x8_I1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + xxspltw vs18, vs28, 2 + xxspltw vs19, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs20, vs29, 0 + xxspltw vs21, vs29, 1 + xxspltw vs22, vs29, 2 + xxspltw vs23, vs29, 3 + + addi BO, BO, 32 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + + xvmulsp vs34, vs0, vs9 + xvmulsp vs35, vs1, vs9 + + xvmulsp vs36, vs0, vs10 + xvmulsp vs37, vs1, vs10 + + xvmulsp vs38, vs0, vs11 + xvmulsp vs39, vs1, vs11 + + xvmulsp vs40, vs0, vs12 + xvmulsp vs41, vs1, vs12 + + xvmulsp vs42, vs0, vs13 + xvmulsp vs43, vs1, vs13 + + xvmulsp vs44, vs0, vs14 + 
xvmulsp vs45, vs1, vs14 + + xvmulsp vs46, vs0, vs15 + xvmulsp vs47, vs1, vs15 + + +.endm + +.macro KERNEL8x8_1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + xxspltw vs18, vs28, 2 + xxspltw vs19, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs20, vs29, 0 + xxspltw vs21, vs29, 1 + xxspltw vs22, vs29, 2 + xxspltw vs23, vs29, 3 + + addi BO, BO, 32 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + + xvmaddasp vs34, vs0, vs9 + xvmaddasp vs35, vs1, vs9 + + xvmaddasp vs36, vs0, vs10 + xvmaddasp vs37, vs1, vs10 + + xvmaddasp vs38, vs0, vs11 + xvmaddasp vs39, vs1, vs11 + + xvmaddasp vs40, vs0, vs12 + xvmaddasp vs41, vs1, vs12 + + xvmaddasp vs42, vs0, vs13 + xvmaddasp vs43, vs1, vs13 + + xvmaddasp vs44, vs0, vs14 + xvmaddasp vs45, vs1, vs14 + + xvmaddasp vs46, vs0, vs15 + xvmaddasp vs47, vs1, vs15 + + +.endm + +.macro KERNEL8x8_2 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs12, vs29, 0 + xxspltw vs13, vs29, 1 + xxspltw vs14, vs29, 2 + xxspltw vs15, vs29, 3 + + addi BO, BO, 32 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + + xvmaddasp vs34, vs4, vs17 + xvmaddasp vs35, vs5, vs17 + + xvmaddasp vs36, vs4, vs18 + xvmaddasp vs37, vs5, vs18 + + xvmaddasp vs38, vs4, vs19 + xvmaddasp vs39, vs5, vs19 + + xvmaddasp vs40, vs4, vs20 + xvmaddasp vs41, vs5, vs20 + + xvmaddasp vs42, vs4, vs21 + xvmaddasp vs43, vs5, vs21 + + xvmaddasp vs44, vs4, vs22 + xvmaddasp vs45, vs5, vs22 + + xvmaddasp vs46, vs4, vs23 + xvmaddasp vs47, vs5, vs23 + + +.endm + +.macro KERNEL8x8_E2 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + + xvmaddasp vs34, vs4, vs17 + xvmaddasp vs35, vs5, vs17 + + xvmaddasp vs36, vs4, vs18 + xvmaddasp vs37, vs5, vs18 + + xvmaddasp vs38, vs4, vs19 + xvmaddasp vs39, vs5, 
vs19 + + xvmaddasp vs40, vs4, vs20 + xvmaddasp vs41, vs5, vs20 + + xvmaddasp vs42, vs4, vs21 + xvmaddasp vs43, vs5, vs21 + + xvmaddasp vs44, vs4, vs22 + xvmaddasp vs45, vs5, vs22 + + xvmaddasp vs46, vs4, vs23 + xvmaddasp vs47, vs5, vs23 + + +.endm + +.macro KERNEL8x8_SUBI1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs12, vs29, 0 + xxspltw vs13, vs29, 1 + xxspltw vs14, vs29, 2 + xxspltw vs15, vs29, 3 + + addi BO, BO, 32 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + + xvmulsp vs34, vs0, vs9 + xvmulsp vs35, vs1, vs9 + + xvmulsp vs36, vs0, vs10 + xvmulsp vs37, vs1, vs10 + + xvmulsp vs38, vs0, vs11 + xvmulsp vs39, vs1, vs11 + + xvmulsp vs40, vs0, vs12 + xvmulsp vs41, vs1, vs12 + + xvmulsp vs42, vs0, vs13 + xvmulsp vs43, vs1, vs13 + + xvmulsp vs44, vs0, vs14 + xvmulsp vs45, vs1, vs14 + + xvmulsp vs46, vs0, vs15 + xvmulsp vs47, vs1, vs15 + + +.endm + +.macro KERNEL8x8_SUB1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs12, vs29, 0 + xxspltw vs13, vs29, 1 + xxspltw vs14, vs29, 2 + xxspltw vs15, vs29, 3 + + addi BO, BO, 32 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + + xvmaddasp vs34, vs0, vs9 + xvmaddasp vs35, vs1, vs9 + + xvmaddasp vs36, vs0, vs10 + xvmaddasp vs37, vs1, vs10 + + xvmaddasp vs38, vs0, vs11 + xvmaddasp vs39, vs1, vs11 + + xvmaddasp vs40, vs0, vs12 + xvmaddasp vs41, vs1, vs12 + + xvmaddasp vs42, vs0, vs13 + xvmaddasp vs43, vs1, vs13 + + xvmaddasp vs44, vs0, vs14 + xvmaddasp vs45, vs1, vs14 + + xvmaddasp vs46, vs0, vs15 + xvmaddasp vs47, vs1, vs15 + + +.endm + +.macro SAVE8x8 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef 
TRMMKERNEL + + xvmulsp vs0, vs32, alpha_vr + xvmulsp vs1, vs33, alpha_vr + +#else + + xvmaddasp vs0, vs32, alpha_vr + xvmaddasp vs1, vs33, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs34, alpha_vr + xvmulsp vs1, vs35, alpha_vr + +#else + + xvmaddasp vs0, vs34, alpha_vr + xvmaddasp vs1, vs35, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs36, alpha_vr + xvmulsp vs1, vs37, alpha_vr + +#else + + xvmaddasp vs0, vs36, alpha_vr + xvmaddasp vs1, vs37, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs38, alpha_vr + xvmulsp vs1, vs39, alpha_vr + +#else + + xvmaddasp vs0, vs38, alpha_vr + xvmaddasp vs1, vs39, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs40, alpha_vr + xvmulsp vs1, vs41, alpha_vr + +#else + + xvmaddasp vs0, vs40, alpha_vr + xvmaddasp vs1, vs41, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs42, alpha_vr + xvmulsp vs1, vs43, alpha_vr + +#else + + xvmaddasp vs0, vs42, alpha_vr + xvmaddasp vs1, vs43, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs44, alpha_vr + xvmulsp vs1, vs45, alpha_vr + +#else + + xvmaddasp vs0, vs44, alpha_vr 
+ xvmaddasp vs1, vs45, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs46, alpha_vr + xvmulsp vs1, vs47, alpha_vr + +#else + + xvmaddasp vs0, vs46, alpha_vr + xvmaddasp vs1, vs47, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + addi CO, CO, 32 + +.endm + + +/********************************************************************************************** +* Macros for N=8 and M=4 +**********************************************************************************************/ + +.macro LOAD8x4_1 + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs12, vs29, 0 + xxspltw vs13, vs29, 1 + xxspltw vs14, vs29, 2 + xxspltw vs15, vs29, 3 + + addi BO, BO, 32 + +.endm + +.macro KERNEL8x4_I1 + + + lxvw4x vs4, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + xxspltw vs18, vs28, 2 + xxspltw vs19, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs20, vs29, 0 + xxspltw vs21, vs29, 1 + xxspltw vs22, vs29, 2 + xxspltw vs23, vs29, 3 + + addi BO, BO, 32 + + + xvmulsp vs32, vs0, vs8 + + xvmulsp vs33, vs0, vs9 + + xvmulsp vs34, vs0, vs10 + + xvmulsp vs35, vs0, vs11 + + xvmulsp vs36, vs0, vs12 + + xvmulsp vs37, vs0, vs13 + + xvmulsp vs38, vs0, vs14 + + xvmulsp vs39, vs0, vs15 + + +.endm + +.macro KERNEL8x4_1 + + + lxvw4x vs4, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + xxspltw vs18, vs28, 2 + xxspltw vs19, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs20, vs29, 0 + xxspltw vs21, vs29, 1 + xxspltw vs22, vs29, 2 + xxspltw vs23, vs29, 3 + + addi BO, BO, 32 + + + xvmaddasp vs32, vs0, vs8 + + xvmaddasp vs33, vs0, vs9 + + xvmaddasp vs34, vs0, 
vs10 + + xvmaddasp vs35, vs0, vs11 + + xvmaddasp vs36, vs0, vs12 + + xvmaddasp vs37, vs0, vs13 + + xvmaddasp vs38, vs0, vs14 + + xvmaddasp vs39, vs0, vs15 + + +.endm + +.macro KERNEL8x4_2 + + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs12, vs29, 0 + xxspltw vs13, vs29, 1 + xxspltw vs14, vs29, 2 + xxspltw vs15, vs29, 3 + + addi BO, BO, 32 + + + xvmaddasp vs32, vs4, vs16 + + xvmaddasp vs33, vs4, vs17 + + xvmaddasp vs34, vs4, vs18 + + xvmaddasp vs35, vs4, vs19 + + xvmaddasp vs36, vs4, vs20 + + xvmaddasp vs37, vs4, vs21 + + xvmaddasp vs38, vs4, vs22 + + xvmaddasp vs39, vs4, vs23 + + +.endm + +.macro KERNEL8x4_E2 + + + xvmaddasp vs32, vs4, vs16 + + xvmaddasp vs33, vs4, vs17 + + xvmaddasp vs34, vs4, vs18 + + xvmaddasp vs35, vs4, vs19 + + xvmaddasp vs36, vs4, vs20 + + xvmaddasp vs37, vs4, vs21 + + xvmaddasp vs38, vs4, vs22 + + xvmaddasp vs39, vs4, vs23 + + +.endm + +.macro KERNEL8x4_SUBI1 + + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs12, vs29, 0 + xxspltw vs13, vs29, 1 + xxspltw vs14, vs29, 2 + xxspltw vs15, vs29, 3 + + addi BO, BO, 32 + + + xvmulsp vs32, vs0, vs8 + + xvmulsp vs33, vs0, vs9 + + xvmulsp vs34, vs0, vs10 + + xvmulsp vs35, vs0, vs11 + + xvmulsp vs36, vs0, vs12 + + xvmulsp vs37, vs0, vs13 + + xvmulsp vs38, vs0, vs14 + + xvmulsp vs39, vs0, vs15 + + +.endm + +.macro KERNEL8x4_SUB1 + + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs12, vs29, 0 + xxspltw vs13, vs29, 1 + xxspltw vs14, vs29, 2 + xxspltw vs15, vs29, 3 + + addi BO, BO, 32 + + + xvmaddasp vs32, vs0, vs8 + + xvmaddasp vs33, vs0, vs9 + + 
xvmaddasp vs34, vs0, vs10 + + xvmaddasp vs35, vs0, vs11 + + xvmaddasp vs36, vs0, vs12 + + xvmaddasp vs37, vs0, vs13 + + xvmaddasp vs38, vs0, vs14 + + xvmaddasp vs39, vs0, vs15 + + +.endm + +.macro SAVE8x4 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs32, alpha_vr + +#else + + xvmaddasp vs0, vs32, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs33, alpha_vr + +#else + + xvmaddasp vs0, vs33, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs34, alpha_vr + +#else + + xvmaddasp vs0, vs34, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs35, alpha_vr + +#else + + xvmaddasp vs0, vs35, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs36, alpha_vr + +#else + + xvmaddasp vs0, vs36, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs37, alpha_vr + +#else + + xvmaddasp vs0, vs37, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs38, alpha_vr + +#else + + xvmaddasp vs0, vs38, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs39, alpha_vr + +#else + + xvmaddasp vs0, vs39, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + addi CO, CO, 16 + +.endm + + 
+/********************************************************************************************** +* Macros for N=8 and M=2 +**********************************************************************************************/ + +.macro LOAD8x2_1 + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi T1, T1, 16 + + lxsspx vs12, o0, T1 + lxsspx vs13, o4, T1 + lxsspx vs14, o8, T1 + lxsspx vs15, o12, T1 + + addi BO, BO, 32 + +.endm + +.macro KERNEL8x2_I1 + + + lxsspx vs4, o0, AO + lxsspx vs5, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs16, o0, T1 + lxsspx vs17, o4, T1 + lxsspx vs18, o8, T1 + lxsspx vs19, o12, T1 + + addi T1, T1, 16 + + lxsspx vs20, o0, T1 + lxsspx vs21, o4, T1 + lxsspx vs22, o8, T1 + lxsspx vs23, o12, T1 + + addi BO, BO, 32 + + + xsmulsp vs32, vs0, vs8 + xsmulsp vs33, vs1, vs8 + + xsmulsp vs34, vs0, vs9 + xsmulsp vs35, vs1, vs9 + + xsmulsp vs36, vs0, vs10 + xsmulsp vs37, vs1, vs10 + + xsmulsp vs38, vs0, vs11 + xsmulsp vs39, vs1, vs11 + + xsmulsp vs40, vs0, vs12 + xsmulsp vs41, vs1, vs12 + + xsmulsp vs42, vs0, vs13 + xsmulsp vs43, vs1, vs13 + + xsmulsp vs44, vs0, vs14 + xsmulsp vs45, vs1, vs14 + + xsmulsp vs46, vs0, vs15 + xsmulsp vs47, vs1, vs15 + + +.endm + +.macro KERNEL8x2_1 + + + lxsspx vs4, o0, AO + lxsspx vs5, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs16, o0, T1 + lxsspx vs17, o4, T1 + lxsspx vs18, o8, T1 + lxsspx vs19, o12, T1 + + addi T1, T1, 16 + + lxsspx vs20, o0, T1 + lxsspx vs21, o4, T1 + lxsspx vs22, o8, T1 + lxsspx vs23, o12, T1 + + addi BO, BO, 32 + + + xsmaddasp vs32, vs0, vs8 + xsmaddasp vs33, vs1, vs8 + + xsmaddasp vs34, vs0, vs9 + xsmaddasp vs35, vs1, vs9 + + xsmaddasp vs36, vs0, vs10 + xsmaddasp vs37, vs1, vs10 + + xsmaddasp vs38, vs0, vs11 + xsmaddasp vs39, vs1, vs11 + + xsmaddasp vs40, vs0, vs12 + xsmaddasp vs41, vs1, vs12 + + xsmaddasp vs42, vs0, vs13 + xsmaddasp vs43, vs1, vs13 + + xsmaddasp vs44, 
vs0, vs14 + xsmaddasp vs45, vs1, vs14 + + xsmaddasp vs46, vs0, vs15 + xsmaddasp vs47, vs1, vs15 + + +.endm + +.macro KERNEL8x2_2 + + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi T1, T1, 16 + + lxsspx vs12, o0, T1 + lxsspx vs13, o4, T1 + lxsspx vs14, o8, T1 + lxsspx vs15, o12, T1 + + addi BO, BO, 32 + + + xsmaddasp vs32, vs4, vs16 + xsmaddasp vs33, vs5, vs16 + + xsmaddasp vs34, vs4, vs17 + xsmaddasp vs35, vs5, vs17 + + xsmaddasp vs36, vs4, vs18 + xsmaddasp vs37, vs5, vs18 + + xsmaddasp vs38, vs4, vs19 + xsmaddasp vs39, vs5, vs19 + + xsmaddasp vs40, vs4, vs20 + xsmaddasp vs41, vs5, vs20 + + xsmaddasp vs42, vs4, vs21 + xsmaddasp vs43, vs5, vs21 + + xsmaddasp vs44, vs4, vs22 + xsmaddasp vs45, vs5, vs22 + + xsmaddasp vs46, vs4, vs23 + xsmaddasp vs47, vs5, vs23 + + +.endm + +.macro KERNEL8x2_E2 + + + xsmaddasp vs32, vs4, vs16 + xsmaddasp vs33, vs5, vs16 + + xsmaddasp vs34, vs4, vs17 + xsmaddasp vs35, vs5, vs17 + + xsmaddasp vs36, vs4, vs18 + xsmaddasp vs37, vs5, vs18 + + xsmaddasp vs38, vs4, vs19 + xsmaddasp vs39, vs5, vs19 + + xsmaddasp vs40, vs4, vs20 + xsmaddasp vs41, vs5, vs20 + + xsmaddasp vs42, vs4, vs21 + xsmaddasp vs43, vs5, vs21 + + xsmaddasp vs44, vs4, vs22 + xsmaddasp vs45, vs5, vs22 + + xsmaddasp vs46, vs4, vs23 + xsmaddasp vs47, vs5, vs23 + + +.endm + +.macro KERNEL8x2_SUBI1 + + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi T1, T1, 16 + + lxsspx vs12, o0, T1 + lxsspx vs13, o4, T1 + lxsspx vs14, o8, T1 + lxsspx vs15, o12, T1 + + addi BO, BO, 32 + + + xsmulsp vs32, vs0, vs8 + xsmulsp vs33, vs1, vs8 + + xsmulsp vs34, vs0, vs9 + xsmulsp vs35, vs1, vs9 + + xsmulsp vs36, vs0, vs10 + xsmulsp vs37, vs1, vs10 + + xsmulsp vs38, vs0, vs11 + xsmulsp vs39, vs1, vs11 + + xsmulsp vs40, vs0, vs12 + xsmulsp vs41, vs1, vs12 + 
+ xsmulsp vs42, vs0, vs13 + xsmulsp vs43, vs1, vs13 + + xsmulsp vs44, vs0, vs14 + xsmulsp vs45, vs1, vs14 + + xsmulsp vs46, vs0, vs15 + xsmulsp vs47, vs1, vs15 + + +.endm + +.macro KERNEL8x2_SUB1 + + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi T1, T1, 16 + + lxsspx vs12, o0, T1 + lxsspx vs13, o4, T1 + lxsspx vs14, o8, T1 + lxsspx vs15, o12, T1 + + addi BO, BO, 32 + + + xsmaddasp vs32, vs0, vs8 + xsmaddasp vs33, vs1, vs8 + + xsmaddasp vs34, vs0, vs9 + xsmaddasp vs35, vs1, vs9 + + xsmaddasp vs36, vs0, vs10 + xsmaddasp vs37, vs1, vs10 + + xsmaddasp vs38, vs0, vs11 + xsmaddasp vs39, vs1, vs11 + + xsmaddasp vs40, vs0, vs12 + xsmaddasp vs41, vs1, vs12 + + xsmaddasp vs42, vs0, vs13 + xsmaddasp vs43, vs1, vs13 + + xsmaddasp vs44, vs0, vs14 + xsmaddasp vs45, vs1, vs14 + + xsmaddasp vs46, vs0, vs15 + xsmaddasp vs47, vs1, vs15 + + +.endm + +.macro SAVE8x2 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + + xsmulsp vs0, vs32, alpha_r + xsmulsp vs1, vs33, alpha_r + +#else + + xsmaddasp vs0, vs32, alpha_r + xsmaddasp vs1, vs33, alpha_r + +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + + xsmulsp vs0, vs34, alpha_r + xsmulsp vs1, vs35, alpha_r + +#else + + xsmaddasp vs0, vs34, alpha_r + xsmaddasp vs1, vs35, alpha_r + +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + + xsmulsp vs0, vs36, alpha_r + xsmulsp vs1, vs37, alpha_r + +#else + + xsmaddasp vs0, vs36, alpha_r + xsmaddasp vs1, vs37, alpha_r + +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif 
+ +#ifdef TRMMKERNEL + + xsmulsp vs0, vs38, alpha_r + xsmulsp vs1, vs39, alpha_r + +#else + + xsmaddasp vs0, vs38, alpha_r + xsmaddasp vs1, vs39, alpha_r + +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + + xsmulsp vs0, vs40, alpha_r + xsmulsp vs1, vs41, alpha_r + +#else + + xsmaddasp vs0, vs40, alpha_r + xsmaddasp vs1, vs41, alpha_r + +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + + xsmulsp vs0, vs42, alpha_r + xsmulsp vs1, vs43, alpha_r + +#else + + xsmaddasp vs0, vs42, alpha_r + xsmaddasp vs1, vs43, alpha_r + +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + + xsmulsp vs0, vs44, alpha_r + xsmulsp vs1, vs45, alpha_r + +#else + + xsmaddasp vs0, vs44, alpha_r + xsmaddasp vs1, vs45, alpha_r + +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + + xsmulsp vs0, vs46, alpha_r + xsmulsp vs1, vs47, alpha_r + +#else + + xsmaddasp vs0, vs46, alpha_r + xsmaddasp vs1, vs47, alpha_r + +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + addi CO, CO, 8 + +.endm + + +/********************************************************************************************** +* Macros for N=8 and M=1 +**********************************************************************************************/ + +.macro LOAD8x1_1 + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi T1, T1, 16 + + lxsspx vs12, o0, T1 + lxsspx vs13, o4, T1 + lxsspx vs14, o8, T1 + lxsspx vs15, o12, T1 + + addi 
BO, BO, 32 + +.endm + +.macro KERNEL8x1_I1 + + + lxsspx vs4, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs16, o0, T1 + lxsspx vs17, o4, T1 + lxsspx vs18, o8, T1 + lxsspx vs19, o12, T1 + + addi T1, T1, 16 + + lxsspx vs20, o0, T1 + lxsspx vs21, o4, T1 + lxsspx vs22, o8, T1 + lxsspx vs23, o12, T1 + + addi BO, BO, 32 + + + xsmulsp vs32, vs0, vs8 + + xsmulsp vs33, vs0, vs9 + + xsmulsp vs34, vs0, vs10 + + xsmulsp vs35, vs0, vs11 + + xsmulsp vs36, vs0, vs12 + + xsmulsp vs37, vs0, vs13 + + xsmulsp vs38, vs0, vs14 + + xsmulsp vs39, vs0, vs15 + + +.endm + +.macro KERNEL8x1_1 + + + lxsspx vs4, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs16, o0, T1 + lxsspx vs17, o4, T1 + lxsspx vs18, o8, T1 + lxsspx vs19, o12, T1 + + addi T1, T1, 16 + + lxsspx vs20, o0, T1 + lxsspx vs21, o4, T1 + lxsspx vs22, o8, T1 + lxsspx vs23, o12, T1 + + addi BO, BO, 32 + + + xsmaddasp vs32, vs0, vs8 + + xsmaddasp vs33, vs0, vs9 + + xsmaddasp vs34, vs0, vs10 + + xsmaddasp vs35, vs0, vs11 + + xsmaddasp vs36, vs0, vs12 + + xsmaddasp vs37, vs0, vs13 + + xsmaddasp vs38, vs0, vs14 + + xsmaddasp vs39, vs0, vs15 + + +.endm + +.macro KERNEL8x1_2 + + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi T1, T1, 16 + + lxsspx vs12, o0, T1 + lxsspx vs13, o4, T1 + lxsspx vs14, o8, T1 + lxsspx vs15, o12, T1 + + addi BO, BO, 32 + + + xsmaddasp vs32, vs4, vs16 + + xsmaddasp vs33, vs4, vs17 + + xsmaddasp vs34, vs4, vs18 + + xsmaddasp vs35, vs4, vs19 + + xsmaddasp vs36, vs4, vs20 + + xsmaddasp vs37, vs4, vs21 + + xsmaddasp vs38, vs4, vs22 + + xsmaddasp vs39, vs4, vs23 + + +.endm + +.macro KERNEL8x1_E2 + + + xsmaddasp vs32, vs4, vs16 + + xsmaddasp vs33, vs4, vs17 + + xsmaddasp vs34, vs4, vs18 + + xsmaddasp vs35, vs4, vs19 + + xsmaddasp vs36, vs4, vs20 + + xsmaddasp vs37, vs4, vs21 + + xsmaddasp vs38, vs4, vs22 + + xsmaddasp vs39, vs4, vs23 + + +.endm + +.macro KERNEL8x1_SUBI1 + + + lxsspx vs0, o0, AO + + addi 
AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi T1, T1, 16 + + lxsspx vs12, o0, T1 + lxsspx vs13, o4, T1 + lxsspx vs14, o8, T1 + lxsspx vs15, o12, T1 + + addi BO, BO, 32 + + + xsmulsp vs32, vs0, vs8 + + xsmulsp vs33, vs0, vs9 + + xsmulsp vs34, vs0, vs10 + + xsmulsp vs35, vs0, vs11 + + xsmulsp vs36, vs0, vs12 + + xsmulsp vs37, vs0, vs13 + + xsmulsp vs38, vs0, vs14 + + xsmulsp vs39, vs0, vs15 + + +.endm + +.macro KERNEL8x1_SUB1 + + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi T1, T1, 16 + + lxsspx vs12, o0, T1 + lxsspx vs13, o4, T1 + lxsspx vs14, o8, T1 + lxsspx vs15, o12, T1 + + addi BO, BO, 32 + + + xsmaddasp vs32, vs0, vs8 + + xsmaddasp vs33, vs0, vs9 + + xsmaddasp vs34, vs0, vs10 + + xsmaddasp vs35, vs0, vs11 + + xsmaddasp vs36, vs0, vs12 + + xsmaddasp vs37, vs0, vs13 + + xsmaddasp vs38, vs0, vs14 + + xsmaddasp vs39, vs0, vs15 + + +.endm + +.macro SAVE8x1 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + + xsmulsp vs0, vs32, alpha_r + +#else + + xsmaddasp vs0, vs32, alpha_r + +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + + xsmulsp vs0, vs33, alpha_r + +#else + + xsmaddasp vs0, vs33, alpha_r + +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + + xsmulsp vs0, vs34, alpha_r + +#else + + xsmaddasp vs0, vs34, alpha_r + +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + + xsmulsp vs0, vs35, alpha_r + +#else + + xsmaddasp vs0, vs35, alpha_r + +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + + xsmulsp vs0, vs36, alpha_r + 
+#else + + xsmaddasp vs0, vs36, alpha_r + +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + + xsmulsp vs0, vs37, alpha_r + +#else + + xsmaddasp vs0, vs37, alpha_r + +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + + xsmulsp vs0, vs38, alpha_r + +#else + + xsmaddasp vs0, vs38, alpha_r + +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + + xsmulsp vs0, vs39, alpha_r + +#else + + xsmaddasp vs0, vs39, alpha_r + +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + addi CO, CO, 4 + +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=16 +**********************************************************************************************/ + +.macro LOAD4x16_1 + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + addi BO, BO, 16 + +.endm + +.macro KERNEL4x16_I1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + lxvw4x vs6, o32, AO + lxvw4x vs7, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + xxspltw vs18, vs28, 2 + xxspltw vs19, vs28, 3 + + addi BO, BO, 16 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + xvmulsp vs34, vs2, vs8 + xvmulsp vs35, vs3, vs8 + + xvmulsp vs36, vs0, vs9 + xvmulsp vs37, vs1, vs9 + xvmulsp vs38, vs2, vs9 + xvmulsp vs39, vs3, vs9 + + xvmulsp vs40, vs0, vs10 + xvmulsp vs41, vs1, vs10 + xvmulsp vs42, vs2, vs10 + xvmulsp vs43, vs3, vs10 + + xvmulsp vs44, vs0, vs11 + xvmulsp vs45, vs1, vs11 + xvmulsp vs46, vs2, vs11 + xvmulsp vs47, vs3, vs11 + + +.endm + +.macro KERNEL4x16_1 + + + lxvw4x vs4, o0, AO + 
lxvw4x vs5, o16, AO + lxvw4x vs6, o32, AO + lxvw4x vs7, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + xxspltw vs18, vs28, 2 + xxspltw vs19, vs28, 3 + + addi BO, BO, 16 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + xvmaddasp vs34, vs2, vs8 + xvmaddasp vs35, vs3, vs8 + + xvmaddasp vs36, vs0, vs9 + xvmaddasp vs37, vs1, vs9 + xvmaddasp vs38, vs2, vs9 + xvmaddasp vs39, vs3, vs9 + + xvmaddasp vs40, vs0, vs10 + xvmaddasp vs41, vs1, vs10 + xvmaddasp vs42, vs2, vs10 + xvmaddasp vs43, vs3, vs10 + + xvmaddasp vs44, vs0, vs11 + xvmaddasp vs45, vs1, vs11 + xvmaddasp vs46, vs2, vs11 + xvmaddasp vs47, vs3, vs11 + + +.endm + +.macro KERNEL4x16_2 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + addi BO, BO, 16 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + xvmaddasp vs34, vs6, vs16 + xvmaddasp vs35, vs7, vs16 + + xvmaddasp vs36, vs4, vs17 + xvmaddasp vs37, vs5, vs17 + xvmaddasp vs38, vs6, vs17 + xvmaddasp vs39, vs7, vs17 + + xvmaddasp vs40, vs4, vs18 + xvmaddasp vs41, vs5, vs18 + xvmaddasp vs42, vs6, vs18 + xvmaddasp vs43, vs7, vs18 + + xvmaddasp vs44, vs4, vs19 + xvmaddasp vs45, vs5, vs19 + xvmaddasp vs46, vs6, vs19 + xvmaddasp vs47, vs7, vs19 + + +.endm + +.macro KERNEL4x16_E2 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + xvmaddasp vs34, vs6, vs16 + xvmaddasp vs35, vs7, vs16 + + xvmaddasp vs36, vs4, vs17 + xvmaddasp vs37, vs5, vs17 + xvmaddasp vs38, vs6, vs17 + xvmaddasp vs39, vs7, vs17 + + xvmaddasp vs40, vs4, vs18 + xvmaddasp vs41, vs5, vs18 + xvmaddasp vs42, vs6, vs18 + xvmaddasp vs43, vs7, vs18 + + xvmaddasp vs44, vs4, vs19 + xvmaddasp vs45, vs5, vs19 + xvmaddasp vs46, vs6, vs19 + xvmaddasp vs47, vs7, vs19 + + +.endm + +.macro KERNEL4x16_SUBI1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + 
lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + addi BO, BO, 16 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + xvmulsp vs34, vs2, vs8 + xvmulsp vs35, vs3, vs8 + + xvmulsp vs36, vs0, vs9 + xvmulsp vs37, vs1, vs9 + xvmulsp vs38, vs2, vs9 + xvmulsp vs39, vs3, vs9 + + xvmulsp vs40, vs0, vs10 + xvmulsp vs41, vs1, vs10 + xvmulsp vs42, vs2, vs10 + xvmulsp vs43, vs3, vs10 + + xvmulsp vs44, vs0, vs11 + xvmulsp vs45, vs1, vs11 + xvmulsp vs46, vs2, vs11 + xvmulsp vs47, vs3, vs11 + + +.endm + +.macro KERNEL4x16_SUB1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + addi BO, BO, 16 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + xvmaddasp vs34, vs2, vs8 + xvmaddasp vs35, vs3, vs8 + + xvmaddasp vs36, vs0, vs9 + xvmaddasp vs37, vs1, vs9 + xvmaddasp vs38, vs2, vs9 + xvmaddasp vs39, vs3, vs9 + + xvmaddasp vs40, vs0, vs10 + xvmaddasp vs41, vs1, vs10 + xvmaddasp vs42, vs2, vs10 + xvmaddasp vs43, vs3, vs10 + + xvmaddasp vs44, vs0, vs11 + xvmaddasp vs45, vs1, vs11 + xvmaddasp vs46, vs2, vs11 + xvmaddasp vs47, vs3, vs11 + + +.endm + +.macro SAVE4x16 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs32, alpha_vr + xvmulsp vs1, vs33, alpha_vr + xvmulsp vs2, vs34, alpha_vr + xvmulsp vs3, vs35, alpha_vr + +#else + + xvmaddasp vs0, vs32, alpha_vr + xvmaddasp vs1, vs33, alpha_vr + xvmaddasp vs2, vs34, alpha_vr + xvmaddasp vs3, vs35, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + 
lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs36, alpha_vr + xvmulsp vs1, vs37, alpha_vr + xvmulsp vs2, vs38, alpha_vr + xvmulsp vs3, vs39, alpha_vr + +#else + + xvmaddasp vs0, vs36, alpha_vr + xvmaddasp vs1, vs37, alpha_vr + xvmaddasp vs2, vs38, alpha_vr + xvmaddasp vs3, vs39, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs40, alpha_vr + xvmulsp vs1, vs41, alpha_vr + xvmulsp vs2, vs42, alpha_vr + xvmulsp vs3, vs43, alpha_vr + +#else + + xvmaddasp vs0, vs40, alpha_vr + xvmaddasp vs1, vs41, alpha_vr + xvmaddasp vs2, vs42, alpha_vr + xvmaddasp vs3, vs43, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs44, alpha_vr + xvmulsp vs1, vs45, alpha_vr + xvmulsp vs2, vs46, alpha_vr + xvmulsp vs3, vs47, alpha_vr + +#else + + xvmaddasp vs0, vs44, alpha_vr + xvmaddasp vs1, vs45, alpha_vr + xvmaddasp vs2, vs46, alpha_vr + xvmaddasp vs3, vs47, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + addi CO, CO, 64 + +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=8 +**********************************************************************************************/ + +.macro LOAD4x8_1 + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + addi BO, BO, 16 + +.endm 
+ +.macro KERNEL4x8_I1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + xxspltw vs18, vs28, 2 + xxspltw vs19, vs28, 3 + + addi BO, BO, 16 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + + xvmulsp vs34, vs0, vs9 + xvmulsp vs35, vs1, vs9 + + xvmulsp vs36, vs0, vs10 + xvmulsp vs37, vs1, vs10 + + xvmulsp vs38, vs0, vs11 + xvmulsp vs39, vs1, vs11 + + +.endm + +.macro KERNEL4x8_1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + xxspltw vs18, vs28, 2 + xxspltw vs19, vs28, 3 + + addi BO, BO, 16 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + + xvmaddasp vs34, vs0, vs9 + xvmaddasp vs35, vs1, vs9 + + xvmaddasp vs36, vs0, vs10 + xvmaddasp vs37, vs1, vs10 + + xvmaddasp vs38, vs0, vs11 + xvmaddasp vs39, vs1, vs11 + + +.endm + +.macro KERNEL4x8_2 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + addi BO, BO, 16 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + + xvmaddasp vs34, vs4, vs17 + xvmaddasp vs35, vs5, vs17 + + xvmaddasp vs36, vs4, vs18 + xvmaddasp vs37, vs5, vs18 + + xvmaddasp vs38, vs4, vs19 + xvmaddasp vs39, vs5, vs19 + + +.endm + +.macro KERNEL4x8_E2 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + + xvmaddasp vs34, vs4, vs17 + xvmaddasp vs35, vs5, vs17 + + xvmaddasp vs36, vs4, vs18 + xvmaddasp vs37, vs5, vs18 + + xvmaddasp vs38, vs4, vs19 + xvmaddasp vs39, vs5, vs19 + + +.endm + +.macro KERNEL4x8_SUBI1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + addi BO, BO, 16 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + + xvmulsp vs34, vs0, vs9 + xvmulsp 
vs35, vs1, vs9 + + xvmulsp vs36, vs0, vs10 + xvmulsp vs37, vs1, vs10 + + xvmulsp vs38, vs0, vs11 + xvmulsp vs39, vs1, vs11 + + +.endm + +.macro KERNEL4x8_SUB1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + addi BO, BO, 16 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + + xvmaddasp vs34, vs0, vs9 + xvmaddasp vs35, vs1, vs9 + + xvmaddasp vs36, vs0, vs10 + xvmaddasp vs37, vs1, vs10 + + xvmaddasp vs38, vs0, vs11 + xvmaddasp vs39, vs1, vs11 + + +.endm + +.macro SAVE4x8 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs32, alpha_vr + xvmulsp vs1, vs33, alpha_vr + +#else + + xvmaddasp vs0, vs32, alpha_vr + xvmaddasp vs1, vs33, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs34, alpha_vr + xvmulsp vs1, vs35, alpha_vr + +#else + + xvmaddasp vs0, vs34, alpha_vr + xvmaddasp vs1, vs35, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs36, alpha_vr + xvmulsp vs1, vs37, alpha_vr + +#else + + xvmaddasp vs0, vs36, alpha_vr + xvmaddasp vs1, vs37, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs38, alpha_vr + xvmulsp vs1, vs39, alpha_vr + +#else + + xvmaddasp vs0, vs38, alpha_vr + xvmaddasp vs1, vs39, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + addi CO, CO, 32 + +.endm + + 
+/********************************************************************************************** +* Macros for N=4 and M=4 +**********************************************************************************************/ + +.macro LOAD4x4_1 + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + addi BO, BO, 16 + +.endm + +.macro KERNEL4x4_I1 + + + lxvw4x vs4, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + xxspltw vs18, vs28, 2 + xxspltw vs19, vs28, 3 + + addi BO, BO, 16 + + + xvmulsp vs32, vs0, vs8 + + xvmulsp vs33, vs0, vs9 + + xvmulsp vs34, vs0, vs10 + + xvmulsp vs35, vs0, vs11 + + +.endm + +.macro KERNEL4x4_1 + + + lxvw4x vs4, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + xxspltw vs18, vs28, 2 + xxspltw vs19, vs28, 3 + + addi BO, BO, 16 + + + xvmaddasp vs32, vs0, vs8 + + xvmaddasp vs33, vs0, vs9 + + xvmaddasp vs34, vs0, vs10 + + xvmaddasp vs35, vs0, vs11 + + +.endm + +.macro KERNEL4x4_2 + + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + addi BO, BO, 16 + + + xvmaddasp vs32, vs4, vs16 + + xvmaddasp vs33, vs4, vs17 + + xvmaddasp vs34, vs4, vs18 + + xvmaddasp vs35, vs4, vs19 + + +.endm + +.macro KERNEL4x4_E2 + + + xvmaddasp vs32, vs4, vs16 + + xvmaddasp vs33, vs4, vs17 + + xvmaddasp vs34, vs4, vs18 + + xvmaddasp vs35, vs4, vs19 + + +.endm + +.macro KERNEL4x4_SUBI1 + + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + addi BO, BO, 16 + + + xvmulsp vs32, vs0, vs8 + + xvmulsp vs33, vs0, vs9 + + xvmulsp vs34, vs0, vs10 + + xvmulsp vs35, vs0, vs11 + + +.endm + +.macro KERNEL4x4_SUB1 + + + lxvw4x vs0, o0, AO + + addi AO, 
AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + addi BO, BO, 16 + + + xvmaddasp vs32, vs0, vs8 + + xvmaddasp vs33, vs0, vs9 + + xvmaddasp vs34, vs0, vs10 + + xvmaddasp vs35, vs0, vs11 + + +.endm + +.macro SAVE4x4 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs32, alpha_vr + +#else + + xvmaddasp vs0, vs32, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs33, alpha_vr + +#else + + xvmaddasp vs0, vs33, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs34, alpha_vr + +#else + + xvmaddasp vs0, vs34, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs35, alpha_vr + +#else + + xvmaddasp vs0, vs35, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + addi CO, CO, 16 + +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=2 +**********************************************************************************************/ + +.macro LOAD4x2_1 + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi BO, BO, 16 + +.endm + +.macro KERNEL4x2_I1 + + + lxsspx vs4, o0, AO + lxsspx vs5, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs16, o0, T1 + lxsspx vs17, o4, T1 + lxsspx vs18, o8, T1 + lxsspx vs19, o12, T1 + + addi BO, BO, 16 + + + xsmulsp vs32, vs0, vs8 + xsmulsp vs33, vs1, vs8 + + xsmulsp vs34, vs0, vs9 + xsmulsp vs35, vs1, vs9 + + xsmulsp vs36, vs0, vs10 + xsmulsp vs37, vs1, vs10 + + xsmulsp vs38, vs0, 
vs11 + xsmulsp vs39, vs1, vs11 + + +.endm + +.macro KERNEL4x2_1 + + + lxsspx vs4, o0, AO + lxsspx vs5, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs16, o0, T1 + lxsspx vs17, o4, T1 + lxsspx vs18, o8, T1 + lxsspx vs19, o12, T1 + + addi BO, BO, 16 + + + xsmaddasp vs32, vs0, vs8 + xsmaddasp vs33, vs1, vs8 + + xsmaddasp vs34, vs0, vs9 + xsmaddasp vs35, vs1, vs9 + + xsmaddasp vs36, vs0, vs10 + xsmaddasp vs37, vs1, vs10 + + xsmaddasp vs38, vs0, vs11 + xsmaddasp vs39, vs1, vs11 + + +.endm + +.macro KERNEL4x2_2 + + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi BO, BO, 16 + + + xsmaddasp vs32, vs4, vs16 + xsmaddasp vs33, vs5, vs16 + + xsmaddasp vs34, vs4, vs17 + xsmaddasp vs35, vs5, vs17 + + xsmaddasp vs36, vs4, vs18 + xsmaddasp vs37, vs5, vs18 + + xsmaddasp vs38, vs4, vs19 + xsmaddasp vs39, vs5, vs19 + + +.endm + +.macro KERNEL4x2_E2 + + + xsmaddasp vs32, vs4, vs16 + xsmaddasp vs33, vs5, vs16 + + xsmaddasp vs34, vs4, vs17 + xsmaddasp vs35, vs5, vs17 + + xsmaddasp vs36, vs4, vs18 + xsmaddasp vs37, vs5, vs18 + + xsmaddasp vs38, vs4, vs19 + xsmaddasp vs39, vs5, vs19 + + +.endm + +.macro KERNEL4x2_SUBI1 + + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi BO, BO, 16 + + + xsmulsp vs32, vs0, vs8 + xsmulsp vs33, vs1, vs8 + + xsmulsp vs34, vs0, vs9 + xsmulsp vs35, vs1, vs9 + + xsmulsp vs36, vs0, vs10 + xsmulsp vs37, vs1, vs10 + + xsmulsp vs38, vs0, vs11 + xsmulsp vs39, vs1, vs11 + + +.endm + +.macro KERNEL4x2_SUB1 + + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi BO, BO, 16 + + + xsmaddasp vs32, vs0, vs8 + xsmaddasp vs33, vs1, vs8 + + xsmaddasp vs34, vs0, vs9 + xsmaddasp vs35, vs1, vs9 + + xsmaddasp 
vs36, vs0, vs10 + xsmaddasp vs37, vs1, vs10 + + xsmaddasp vs38, vs0, vs11 + xsmaddasp vs39, vs1, vs11 + + +.endm + +.macro SAVE4x2 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + + xsmulsp vs0, vs32, alpha_r + xsmulsp vs1, vs33, alpha_r + +#else + + xsmaddasp vs0, vs32, alpha_r + xsmaddasp vs1, vs33, alpha_r + +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + + xsmulsp vs0, vs34, alpha_r + xsmulsp vs1, vs35, alpha_r + +#else + + xsmaddasp vs0, vs34, alpha_r + xsmaddasp vs1, vs35, alpha_r + +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + + xsmulsp vs0, vs36, alpha_r + xsmulsp vs1, vs37, alpha_r + +#else + + xsmaddasp vs0, vs36, alpha_r + xsmaddasp vs1, vs37, alpha_r + +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + + xsmulsp vs0, vs38, alpha_r + xsmulsp vs1, vs39, alpha_r + +#else + + xsmaddasp vs0, vs38, alpha_r + xsmaddasp vs1, vs39, alpha_r + +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + addi CO, CO, 8 + +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=1 +**********************************************************************************************/ + +.macro LOAD4x1_1 + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi BO, BO, 16 + +.endm + +.macro KERNEL4x1_I1 + + + lxsspx vs4, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs16, o0, T1 + lxsspx vs17, o4, T1 + lxsspx vs18, o8, T1 + lxsspx vs19, o12, T1 + + 
addi BO, BO, 16 + + + xsmulsp vs32, vs0, vs8 + + xsmulsp vs33, vs0, vs9 + + xsmulsp vs34, vs0, vs10 + + xsmulsp vs35, vs0, vs11 + + +.endm + +.macro KERNEL4x1_1 + + + lxsspx vs4, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs16, o0, T1 + lxsspx vs17, o4, T1 + lxsspx vs18, o8, T1 + lxsspx vs19, o12, T1 + + addi BO, BO, 16 + + + xsmaddasp vs32, vs0, vs8 + + xsmaddasp vs33, vs0, vs9 + + xsmaddasp vs34, vs0, vs10 + + xsmaddasp vs35, vs0, vs11 + + +.endm + +.macro KERNEL4x1_2 + + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi BO, BO, 16 + + + xsmaddasp vs32, vs4, vs16 + + xsmaddasp vs33, vs4, vs17 + + xsmaddasp vs34, vs4, vs18 + + xsmaddasp vs35, vs4, vs19 + + +.endm + +.macro KERNEL4x1_E2 + + + xsmaddasp vs32, vs4, vs16 + + xsmaddasp vs33, vs4, vs17 + + xsmaddasp vs34, vs4, vs18 + + xsmaddasp vs35, vs4, vs19 + + +.endm + +.macro KERNEL4x1_SUBI1 + + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi BO, BO, 16 + + + xsmulsp vs32, vs0, vs8 + + xsmulsp vs33, vs0, vs9 + + xsmulsp vs34, vs0, vs10 + + xsmulsp vs35, vs0, vs11 + + +.endm + +.macro KERNEL4x1_SUB1 + + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi BO, BO, 16 + + + xsmaddasp vs32, vs0, vs8 + + xsmaddasp vs33, vs0, vs9 + + xsmaddasp vs34, vs0, vs10 + + xsmaddasp vs35, vs0, vs11 + + +.endm + +.macro SAVE4x1 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + + xsmulsp vs0, vs32, alpha_r + +#else + + xsmaddasp vs0, vs32, alpha_r + +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + + xsmulsp vs0, vs33, alpha_r + +#else + + xsmaddasp vs0, vs33, alpha_r + +#endif + + stxsspx vs0, o0, T1 + 
+ add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + + xsmulsp vs0, vs34, alpha_r + +#else + + xsmaddasp vs0, vs34, alpha_r + +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + + xsmulsp vs0, vs35, alpha_r + +#else + + xsmaddasp vs0, vs35, alpha_r + +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + addi CO, CO, 4 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=16 +**********************************************************************************************/ + +.macro LOAD2x16_1 + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + + addi BO, BO, 8 + +.endm + +.macro KERNEL2x16_I1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + lxvw4x vs6, o32, AO + lxvw4x vs7, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + + addi BO, BO, 8 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + xvmulsp vs34, vs2, vs8 + xvmulsp vs35, vs3, vs8 + + xvmulsp vs36, vs0, vs9 + xvmulsp vs37, vs1, vs9 + xvmulsp vs38, vs2, vs9 + xvmulsp vs39, vs3, vs9 + + +.endm + +.macro KERNEL2x16_1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + lxvw4x vs6, o32, AO + lxvw4x vs7, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + + addi BO, BO, 8 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + xvmaddasp vs34, vs2, vs8 + xvmaddasp vs35, vs3, vs8 + + xvmaddasp vs36, vs0, vs9 + xvmaddasp vs37, vs1, vs9 + xvmaddasp vs38, vs2, vs9 + xvmaddasp vs39, vs3, vs9 + + +.endm + +.macro KERNEL2x16_2 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 
+ xxspltw vs9, vs28, 1 + + addi BO, BO, 8 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + xvmaddasp vs34, vs6, vs16 + xvmaddasp vs35, vs7, vs16 + + xvmaddasp vs36, vs4, vs17 + xvmaddasp vs37, vs5, vs17 + xvmaddasp vs38, vs6, vs17 + xvmaddasp vs39, vs7, vs17 + + +.endm + +.macro KERNEL2x16_E2 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + xvmaddasp vs34, vs6, vs16 + xvmaddasp vs35, vs7, vs16 + + xvmaddasp vs36, vs4, vs17 + xvmaddasp vs37, vs5, vs17 + xvmaddasp vs38, vs6, vs17 + xvmaddasp vs39, vs7, vs17 + + +.endm + +.macro KERNEL2x16_SUBI1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + + addi BO, BO, 8 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + xvmulsp vs34, vs2, vs8 + xvmulsp vs35, vs3, vs8 + + xvmulsp vs36, vs0, vs9 + xvmulsp vs37, vs1, vs9 + xvmulsp vs38, vs2, vs9 + xvmulsp vs39, vs3, vs9 + + +.endm + +.macro KERNEL2x16_SUB1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + + addi BO, BO, 8 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + xvmaddasp vs34, vs2, vs8 + xvmaddasp vs35, vs3, vs8 + + xvmaddasp vs36, vs0, vs9 + xvmaddasp vs37, vs1, vs9 + xvmaddasp vs38, vs2, vs9 + xvmaddasp vs39, vs3, vs9 + + +.endm + +.macro SAVE2x16 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs32, alpha_vr + xvmulsp vs1, vs33, alpha_vr + xvmulsp vs2, vs34, alpha_vr + xvmulsp vs3, vs35, alpha_vr + +#else + + xvmaddasp vs0, vs32, alpha_vr + xvmaddasp vs1, vs33, alpha_vr + xvmaddasp vs2, vs34, alpha_vr + xvmaddasp vs3, vs35, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + 
add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs36, alpha_vr + xvmulsp vs1, vs37, alpha_vr + xvmulsp vs2, vs38, alpha_vr + xvmulsp vs3, vs39, alpha_vr + +#else + + xvmaddasp vs0, vs36, alpha_vr + xvmaddasp vs1, vs37, alpha_vr + xvmaddasp vs2, vs38, alpha_vr + xvmaddasp vs3, vs39, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + addi CO, CO, 64 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=8 +**********************************************************************************************/ + +.macro LOAD2x8_1 + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + + addi BO, BO, 8 + +.endm + +.macro KERNEL2x8_I1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + + addi BO, BO, 8 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + + xvmulsp vs34, vs0, vs9 + xvmulsp vs35, vs1, vs9 + + +.endm + +.macro KERNEL2x8_1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + + addi BO, BO, 8 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + + xvmaddasp vs34, vs0, vs9 + xvmaddasp vs35, vs1, vs9 + + +.endm + +.macro KERNEL2x8_2 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + + addi BO, BO, 8 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + + xvmaddasp vs34, vs4, vs17 + xvmaddasp vs35, vs5, vs17 + + +.endm + +.macro KERNEL2x8_E2 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + + xvmaddasp 
vs34, vs4, vs17 + xvmaddasp vs35, vs5, vs17 + + +.endm + +.macro KERNEL2x8_SUBI1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + + addi BO, BO, 8 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + + xvmulsp vs34, vs0, vs9 + xvmulsp vs35, vs1, vs9 + + +.endm + +.macro KERNEL2x8_SUB1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + + addi BO, BO, 8 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + + xvmaddasp vs34, vs0, vs9 + xvmaddasp vs35, vs1, vs9 + + +.endm + +.macro SAVE2x8 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs32, alpha_vr + xvmulsp vs1, vs33, alpha_vr + +#else + + xvmaddasp vs0, vs32, alpha_vr + xvmaddasp vs1, vs33, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs34, alpha_vr + xvmulsp vs1, vs35, alpha_vr + +#else + + xvmaddasp vs0, vs34, alpha_vr + xvmaddasp vs1, vs35, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + addi CO, CO, 32 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=4 +**********************************************************************************************/ + +.macro LOAD2x4_1 + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + + addi BO, BO, 8 + +.endm + +.macro KERNEL2x4_I1 + + + lxvw4x vs4, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + + addi BO, BO, 8 + + + xvmulsp vs32, vs0, vs8 + + xvmulsp vs33, vs0, vs9 + + +.endm + +.macro KERNEL2x4_1 + + + 
lxvw4x vs4, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + + addi BO, BO, 8 + + + xvmaddasp vs32, vs0, vs8 + + xvmaddasp vs33, vs0, vs9 + + +.endm + +.macro KERNEL2x4_2 + + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + + addi BO, BO, 8 + + + xvmaddasp vs32, vs4, vs16 + + xvmaddasp vs33, vs4, vs17 + + +.endm + +.macro KERNEL2x4_E2 + + + xvmaddasp vs32, vs4, vs16 + + xvmaddasp vs33, vs4, vs17 + + +.endm + +.macro KERNEL2x4_SUBI1 + + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + + addi BO, BO, 8 + + + xvmulsp vs32, vs0, vs8 + + xvmulsp vs33, vs0, vs9 + + +.endm + +.macro KERNEL2x4_SUB1 + + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + + addi BO, BO, 8 + + + xvmaddasp vs32, vs0, vs8 + + xvmaddasp vs33, vs0, vs9 + + +.endm + +.macro SAVE2x4 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs32, alpha_vr + +#else + + xvmaddasp vs0, vs32, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs33, alpha_vr + +#else + + xvmaddasp vs0, vs33, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + addi CO, CO, 16 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=2 +**********************************************************************************************/ + +.macro LOAD2x2_1 + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + + addi BO, BO, 8 + +.endm + +.macro KERNEL2x2_I1 + + + lxsspx vs4, o0, AO + lxsspx vs5, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs16, o0, T1 + lxsspx vs17, 
o4, T1 + + addi BO, BO, 8 + + + xsmulsp vs32, vs0, vs8 + xsmulsp vs33, vs1, vs8 + + xsmulsp vs34, vs0, vs9 + xsmulsp vs35, vs1, vs9 + + +.endm + +.macro KERNEL2x2_1 + + + lxsspx vs4, o0, AO + lxsspx vs5, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs16, o0, T1 + lxsspx vs17, o4, T1 + + addi BO, BO, 8 + + + xsmaddasp vs32, vs0, vs8 + xsmaddasp vs33, vs1, vs8 + + xsmaddasp vs34, vs0, vs9 + xsmaddasp vs35, vs1, vs9 + + +.endm + +.macro KERNEL2x2_2 + + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + + addi BO, BO, 8 + + + xsmaddasp vs32, vs4, vs16 + xsmaddasp vs33, vs5, vs16 + + xsmaddasp vs34, vs4, vs17 + xsmaddasp vs35, vs5, vs17 + + +.endm + +.macro KERNEL2x2_E2 + + + xsmaddasp vs32, vs4, vs16 + xsmaddasp vs33, vs5, vs16 + + xsmaddasp vs34, vs4, vs17 + xsmaddasp vs35, vs5, vs17 + + +.endm + +.macro KERNEL2x2_SUBI1 + + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + + addi BO, BO, 8 + + + xsmulsp vs32, vs0, vs8 + xsmulsp vs33, vs1, vs8 + + xsmulsp vs34, vs0, vs9 + xsmulsp vs35, vs1, vs9 + + +.endm + +.macro KERNEL2x2_SUB1 + + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + + addi BO, BO, 8 + + + xsmaddasp vs32, vs0, vs8 + xsmaddasp vs33, vs1, vs8 + + xsmaddasp vs34, vs0, vs9 + xsmaddasp vs35, vs1, vs9 + + +.endm + +.macro SAVE2x2 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + + xsmulsp vs0, vs32, alpha_r + xsmulsp vs1, vs33, alpha_r + +#else + + xsmaddasp vs0, vs32, alpha_r + xsmaddasp vs1, vs33, alpha_r + +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + + xsmulsp vs0, vs34, alpha_r + xsmulsp vs1, vs35, alpha_r + +#else + + xsmaddasp vs0, vs34, alpha_r + 
xsmaddasp vs1, vs35, alpha_r + +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + addi CO, CO, 8 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=1 +**********************************************************************************************/ + +.macro LOAD2x1_1 + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + + addi BO, BO, 8 + +.endm + +.macro KERNEL2x1_I1 + + + lxsspx vs4, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs16, o0, T1 + lxsspx vs17, o4, T1 + + addi BO, BO, 8 + + + xsmulsp vs32, vs0, vs8 + + xsmulsp vs33, vs0, vs9 + + +.endm + +.macro KERNEL2x1_1 + + + lxsspx vs4, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs16, o0, T1 + lxsspx vs17, o4, T1 + + addi BO, BO, 8 + + + xsmaddasp vs32, vs0, vs8 + + xsmaddasp vs33, vs0, vs9 + + +.endm + +.macro KERNEL2x1_2 + + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + + addi BO, BO, 8 + + + xsmaddasp vs32, vs4, vs16 + + xsmaddasp vs33, vs4, vs17 + + +.endm + +.macro KERNEL2x1_E2 + + + xsmaddasp vs32, vs4, vs16 + + xsmaddasp vs33, vs4, vs17 + + +.endm + +.macro KERNEL2x1_SUBI1 + + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + + addi BO, BO, 8 + + + xsmulsp vs32, vs0, vs8 + + xsmulsp vs33, vs0, vs9 + + +.endm + +.macro KERNEL2x1_SUB1 + + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + + addi BO, BO, 8 + + + xsmaddasp vs32, vs0, vs8 + + xsmaddasp vs33, vs0, vs9 + + +.endm + +.macro SAVE2x1 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + + xsmulsp vs0, vs32, alpha_r + +#else + + xsmaddasp vs0, vs32, alpha_r + +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + + 
xsmulsp vs0, vs33, alpha_r + +#else + + xsmaddasp vs0, vs33, alpha_r + +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + addi CO, CO, 4 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=16 +**********************************************************************************************/ + +.macro LOAD1x16_1 + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + + addi BO, BO, 4 + +.endm + +.macro KERNEL1x16_I1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + lxvw4x vs6, o32, AO + lxvw4x vs7, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + + addi BO, BO, 4 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + xvmulsp vs34, vs2, vs8 + xvmulsp vs35, vs3, vs8 + + +.endm + +.macro KERNEL1x16_1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + lxvw4x vs6, o32, AO + lxvw4x vs7, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + + addi BO, BO, 4 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + xvmaddasp vs34, vs2, vs8 + xvmaddasp vs35, vs3, vs8 + + +.endm + +.macro KERNEL1x16_2 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + + addi BO, BO, 4 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + xvmaddasp vs34, vs6, vs16 + xvmaddasp vs35, vs7, vs16 + + +.endm + +.macro KERNEL1x16_E2 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + xvmaddasp vs34, vs6, vs16 + xvmaddasp vs35, vs7, vs16 + + +.endm + +.macro KERNEL1x16_SUBI1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + + addi BO, BO, 4 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + xvmulsp vs34, vs2, vs8 + 
xvmulsp vs35, vs3, vs8 + + +.endm + +.macro KERNEL1x16_SUB1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + + addi BO, BO, 4 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + xvmaddasp vs34, vs2, vs8 + xvmaddasp vs35, vs3, vs8 + + +.endm + +.macro SAVE1x16 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs32, alpha_vr + xvmulsp vs1, vs33, alpha_vr + xvmulsp vs2, vs34, alpha_vr + xvmulsp vs3, vs35, alpha_vr + +#else + + xvmaddasp vs0, vs32, alpha_vr + xvmaddasp vs1, vs33, alpha_vr + xvmaddasp vs2, vs34, alpha_vr + xvmaddasp vs3, vs35, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + addi CO, CO, 64 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=8 +**********************************************************************************************/ + +.macro LOAD1x8_1 + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + + addi BO, BO, 4 + +.endm + +.macro KERNEL1x8_I1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + + addi BO, BO, 4 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + + +.endm + +.macro KERNEL1x8_1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + + addi BO, BO, 4 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + + +.endm + +.macro KERNEL1x8_2 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + + addi BO, BO, 4 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, 
vs16 + + +.endm + +.macro KERNEL1x8_E2 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + + +.endm + +.macro KERNEL1x8_SUBI1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + + addi BO, BO, 4 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + + +.endm + +.macro KERNEL1x8_SUB1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + + addi BO, BO, 4 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + + +.endm + +.macro SAVE1x8 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs32, alpha_vr + xvmulsp vs1, vs33, alpha_vr + +#else + + xvmaddasp vs0, vs32, alpha_vr + xvmaddasp vs1, vs33, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + addi CO, CO, 32 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=4 +**********************************************************************************************/ + +.macro LOAD1x4_1 + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + + addi BO, BO, 4 + +.endm + +.macro KERNEL1x4_I1 + + + lxvw4x vs4, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + + addi BO, BO, 4 + + + xvmulsp vs32, vs0, vs8 + + +.endm + +.macro KERNEL1x4_1 + + + lxvw4x vs4, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + + addi BO, BO, 4 + + + xvmaddasp vs32, vs0, vs8 + + +.endm + +.macro KERNEL1x4_2 + + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + + addi BO, BO, 4 + + + xvmaddasp vs32, vs4, vs16 + + +.endm + +.macro KERNEL1x4_E2 + + + xvmaddasp vs32, vs4, vs16 + + +.endm + +.macro KERNEL1x4_SUBI1 + + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + 
xxspltw vs8, vs28, 0 + + addi BO, BO, 4 + + + xvmulsp vs32, vs0, vs8 + + +.endm + +.macro KERNEL1x4_SUB1 + + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + + addi BO, BO, 4 + + + xvmaddasp vs32, vs0, vs8 + + +.endm + +.macro SAVE1x4 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + + xvmulsp vs0, vs32, alpha_vr + +#else + + xvmaddasp vs0, vs32, alpha_vr + +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + addi CO, CO, 16 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=2 +**********************************************************************************************/ + +.macro LOAD1x2_1 + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + + addi BO, BO, 4 + +.endm + +.macro KERNEL1x2_I1 + + + lxsspx vs4, o0, AO + lxsspx vs5, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs16, o0, T1 + + addi BO, BO, 4 + + + xsmulsp vs32, vs0, vs8 + xsmulsp vs33, vs1, vs8 + + +.endm + +.macro KERNEL1x2_1 + + + lxsspx vs4, o0, AO + lxsspx vs5, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs16, o0, T1 + + addi BO, BO, 4 + + + xsmaddasp vs32, vs0, vs8 + xsmaddasp vs33, vs1, vs8 + + +.endm + +.macro KERNEL1x2_2 + + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + + addi BO, BO, 4 + + + xsmaddasp vs32, vs4, vs16 + xsmaddasp vs33, vs5, vs16 + + +.endm + +.macro KERNEL1x2_E2 + + + xsmaddasp vs32, vs4, vs16 + xsmaddasp vs33, vs5, vs16 + + +.endm + +.macro KERNEL1x2_SUBI1 + + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + + addi BO, BO, 4 + + + xsmulsp vs32, vs0, vs8 + xsmulsp vs33, vs1, vs8 + + +.endm + +.macro KERNEL1x2_SUB1 + + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + + addi BO, BO, 4 + + + 
xsmaddasp vs32, vs0, vs8 + xsmaddasp vs33, vs1, vs8 + + +.endm + +.macro SAVE1x2 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + + xsmulsp vs0, vs32, alpha_r + xsmulsp vs1, vs33, alpha_r + +#else + + xsmaddasp vs0, vs32, alpha_r + xsmaddasp vs1, vs33, alpha_r + +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + addi CO, CO, 8 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=1 +**********************************************************************************************/ + +.macro LOAD1x1_1 + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + + addi BO, BO, 4 + +.endm + +.macro KERNEL1x1_I1 + + + lxsspx vs4, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs16, o0, T1 + + addi BO, BO, 4 + + + xsmulsp vs32, vs0, vs8 + + +.endm + +.macro KERNEL1x1_1 + + + lxsspx vs4, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs16, o0, T1 + + addi BO, BO, 4 + + + xsmaddasp vs32, vs0, vs8 + + +.endm + +.macro KERNEL1x1_2 + + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + + addi BO, BO, 4 + + + xsmaddasp vs32, vs4, vs16 + + +.endm + +.macro KERNEL1x1_E2 + + + xsmaddasp vs32, vs4, vs16 + + +.endm + +.macro KERNEL1x1_SUBI1 + + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + + addi BO, BO, 4 + + + xsmulsp vs32, vs0, vs8 + + +.endm + +.macro KERNEL1x1_SUB1 + + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + + addi BO, BO, 4 + + + xsmaddasp vs32, vs0, vs8 + + +.endm + +.macro SAVE1x1 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + + xsmulsp vs0, vs32, alpha_r + +#else + + xsmaddasp vs0, vs32, alpha_r + +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + addi CO, CO, 4 + +.endm + diff --git a/kernel/power/strmm_kernel_16x8_power8.S 
b/kernel/power/strmm_kernel_16x8_power8.S new file mode 100644 index 000000000..5b1c5ca6b --- /dev/null +++ b/kernel/power/strmm_kernel_16x8_power8.S @@ -0,0 +1,364 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2016/03/14 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +**************************************************************************************/ + +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"
#include "def_vsx.h"

/* Width of an integer load for stack-passed arguments. */
#ifndef __64BIT__
#define LOAD	lwz
#else
#define LOAD	ld
#endif

/* Stack frame: f14-f31 at 0..136, GPRs r31..r14 above them, then the
   ALPHA/FZERO scratch slots near the top of the frame. */
#ifdef __64BIT__
#define STACKSIZE 320
#define ALPHA_SP 296(SP)
#define FZERO 304(SP)
#else
#define STACKSIZE 240
#define ALPHA_SP 224(SP)
#define FZERO 232(SP)
#endif

/* Problem dimensions (first three integer arguments in every ABI). */
#define M	r3
#define N	r4
#define K	r5

/* ABI-dependent registers for the pointer/stride/offset arguments. */
#ifdef linux
#ifndef __64BIT__
#define A	r6
#define B	r7
#define C	r8
#define LDC	r9
#define OFFSET	r10
#else
#define A	r7
#define B	r8
#define C	r9
#define LDC	r10
#define OFFSET	r6
#endif
#endif

#if defined(_AIX) || defined(__APPLE__)
#if !defined(__64BIT__) && defined(DOUBLE)
#define A	r8
#define B	r9
#define C	r10
#define LDC	r7
#define OFFSET	r6
#else
#define A	r7
#define B	r8
#define C	r9
#define LDC	r10
#define OFFSET	r6
#endif
#endif

/* alpha: scalar copy (alpha_r) and 4-lane float vector copy (alpha_vr). */
#define alpha_r  vs30
#define alpha_vr vs31

#define o0	0

/* Register names consumed by the logic/macro include files. */
#define o12	r14
#define o4	r15
#define K1	r16
#define o8	r17
#define L	r18
#define T1	r19
#define KK	r20
#define KKK	r21	/* was the bare number 21; use the rNN form like every other define */
#define I	r22
#define J	r23
#define AO	r24
#define BO	r25
#define CO	r26
#define o16	r27
#define o32	r28
#define o48	r29

#define PRE	r30
#define T2	r31

#include "sgemm_macros_16x8_power8.S"


#ifndef NEEDPARAM

	PROLOGUE
	PROFCODE

	/* Build the frame and save the callee-saved FPRs f14-f31. */
	addi	SP, SP, -STACKSIZE
	li	r0, 0

	stfd	f14,    0(SP)
	stfd	f15,    8(SP)
	stfd	f16,   16(SP)
	stfd	f17,   24(SP)

	stfd	f18,   32(SP)
	stfd	f19,   40(SP)
	stfd	f20,   48(SP)
	stfd	f21,   56(SP)

	stfd	f22,   64(SP)
	stfd	f23,   72(SP)
	stfd	f24,   80(SP)
	stfd	f25,   88(SP)

	stfd	f26,   96(SP)
	stfd	f27,  104(SP)
	stfd	f28,  112(SP)
	stfd	f29,  120(SP)

	stfd	f30,  128(SP)
	stfd	f31,  136(SP)

	/* Save the callee-saved GPRs r14-r31. */
#ifdef __64BIT__
	std	r31,  144(SP)
	std	r30,  152(SP)
	std	r29,  160(SP)
	std	r28,  168(SP)
	std	r27,  176(SP)
	std	r26,  184(SP)
	std	r25,  192(SP)
	std	r24,  200(SP)
	std	r23,  208(SP)
	std	r22,  216(SP)
	std	r21,  224(SP)
	std	r20,  232(SP)
	std	r19,  240(SP)
	std	r18,  248(SP)
	std	r17,  256(SP)
	std	r16,  264(SP)
	std	r15,  272(SP)
	std	r14,  280(SP)
#else
	stw	r31,  144(SP)
	stw	r30,  148(SP)
	stw	r29,  152(SP)
	stw	r28,  156(SP)
	stw	r27,  160(SP)
	stw	r26,  164(SP)
	stw	r25,  168(SP)
	stw	r24,  172(SP)
	stw	r23,  176(SP)
	stw	r22,  180(SP)
	stw	r21,  184(SP)
	stw	r20,  188(SP)
	stw	r19,  192(SP)
	stw	r18,  196(SP)
	stw	r17,  200(SP)
	stw	r16,  204(SP)
	stw	r15,  208(SP)
	stw	r14,  212(SP)
#endif

	// stfd f1, ALPHA_SP
	// stw r0, FZERO

	/* On 32-bit AIX/Darwin with DOUBLE, LDC arrives on the stack. */
#if defined(_AIX) || defined(__APPLE__)
#if !defined(__64BIT__) && defined(DOUBLE)
	lwz	LDC, FRAMESLOT(0) + STACKSIZE(SP)
#endif
#endif

	/* Convert the column stride from elements to bytes. */
	slwi	LDC, LDC, BASE_SHIFT

	/* For the TRMM variant the OFFSET argument arrives on the stack. */
#if defined(TRMMKERNEL)
#if defined(linux) && defined(__64BIT__)
	ld	OFFSET,  FRAMESLOT(0) + STACKSIZE(SP)
#endif

#if defined(_AIX) || defined(__APPLE__)
#ifdef __64BIT__
	ld	OFFSET,  FRAMESLOT(0) + STACKSIZE(SP)
#else
#ifdef DOUBLE
	lwz	OFFSET,  FRAMESLOT(1) + STACKSIZE(SP)
#else
	lwz	OFFSET,  FRAMESLOT(0) + STACKSIZE(SP)
#endif
#endif
#endif
#endif

	/* NOTE(review): OFFSET is only loaded for TRMMKERNEL builds; for plain
	   GEMM this copies an argument register whose value is never used. */
	mr	KK, OFFSET
#if defined(TRMMKERNEL) && !defined(LEFT)
	neg	KK, KK
#endif

	/* Nothing to do for an empty problem. */
	cmpwi	cr0, M, 0
	ble	.L999_H1
	cmpwi	cr0, N, 0
	ble	.L999_H1
	cmpwi	cr0, K, 0
	ble	.L999_H1

	/* Prefetch distance and the small byte offsets used by the macros. */
	li	PRE, 256
	li	o4,  4
	li	o8,  8
	li	o12, 12
	li	o16, 16
	li	o32, 32
	li	o48, 48

	/* Splat alpha (f1) into four consecutive float slots on the stack,
	   then load it both as a scalar (alpha_r) and as a vector (alpha_vr). */
	addi	T1, SP, 300
	stfs	f1,  0(T1)
	stfs	f1,  4(T1)
	stfs	f1,  8(T1)
	stfs	f1, 12(T1)

	lxsspx	vs28, 0, T1

	xxspltw	alpha_r, vs28, 0
	lxvw4x	alpha_vr, 0, T1


	/* BUGFIX: this is the SGEMM kernel, so include the SGEMM loop logic.
	   The original included strmm_logic_16x8_power8.S (the TRMM logic),
	   whose KK/KKK trip-count adjustments read the uninitialized OFFSET
	   value in a plain GEMM build and would produce wrong results. The
	   separate strmm_kernel_16x8_power8.S already includes the TRMM logic. */
#include "sgemm_logic_16x8_power8.S"

.L999:
	/* Return 0 and restore all callee-saved registers. */
	addi	r3, 0, 0

	lfd	f14,    0(SP)
	lfd	f15,    8(SP)
	lfd	f16,   16(SP)
	lfd	f17,   24(SP)

	lfd	f18,   32(SP)
	lfd	f19,   40(SP)
	lfd	f20,   48(SP)
	lfd	f21,   56(SP)

	lfd	f22,   64(SP)
	lfd	f23,   72(SP)
	lfd	f24,   80(SP)
	lfd	f25,   88(SP)

	lfd	f26,   96(SP)
	lfd	f27,  104(SP)
	lfd	f28,  112(SP)
	lfd	f29,  120(SP)

	lfd	f30,  128(SP)
	lfd	f31,  136(SP)

#ifdef __64BIT__
	ld	r31,  144(SP)
	ld	r30,  152(SP)
	ld	r29,  160(SP)
	ld	r28,  168(SP)
	ld	r27,  176(SP)
	ld	r26,  184(SP)
	ld	r25,  192(SP)
	ld	r24,  200(SP)
	ld	r23,  208(SP)
	ld	r22,  216(SP)
	ld	r21,  224(SP)
	ld	r20,  232(SP)
	ld	r19,  240(SP)
	ld	r18,  248(SP)
	ld	r17,  256(SP)
	ld	r16,  264(SP)
	ld	r15,  272(SP)
	ld	r14,  280(SP)
#else
	lwz	r31,  144(SP)
	lwz	r30,  148(SP)
	lwz	r29,  152(SP)
	lwz	r28,  156(SP)
	lwz	r27,  160(SP)
	lwz	r26,  164(SP)
	lwz	r25,  168(SP)
	lwz	r24,  172(SP)
	lwz	r23,  176(SP)
	lwz	r22,  180(SP)
	lwz	r21,  184(SP)
	lwz	r20,  188(SP)
	lwz	r19,  192(SP)
	lwz	r18,  196(SP)
	lwz	r17,  200(SP)
	lwz	r16,  204(SP)
	lwz	r15,  208(SP)
	lwz	r14,  212(SP)
#endif

	addi	SP, SP, STACKSIZE

	blr

	EPILOGUE
#endif
diff --git a/kernel/power/strmm_logic_16x8_power8.S b/kernel/power/strmm_logic_16x8_power8.S
new file mode 100644
index 000000000..0d6d04858
--- /dev/null
+++ b/kernel/power/strmm_logic_16x8_power8.S
@@ -0,0 +1,2969 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/03/14 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +**************************************************************************************/ + + + + srawi. J, N, 3 + ble .LSTRMM_L8_END + +.LSTRMM_L8_BEGIN: + + mr CO, C + mr AO, A + slwi T1, LDC , 3 + add C, C, T1 + +#if defined(LEFT) + mr KK, OFFSET // OFFSET -> KK +#endif + + srawi. 
I, M, 4 + ble .LSTRMM_L8x16_END + +.LSTRMM_L8x16_BEGIN: + + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 5 // Number of values in B shifted + slwi T2, KK, 6 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 16 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 8 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble .LSTRMM_L8x16_SUB0 + cmpwi cr0, L, 1 + ble .LSTRMM_L8x16_SUB4 + +.LSTRMM_L8x16_LOOP_START: + + dcbt AO, PRE + LOAD8x16_1 + dcbt AO, PRE + KERNEL8x16_I1 + dcbt AO, PRE + KERNEL8x16_2 + dcbt AO, PRE + KERNEL8x16_1 + dcbt AO, PRE + KERNEL8x16_2 + + dcbt AO, PRE + KERNEL8x16_1 + dcbt AO, PRE + KERNEL8x16_2 + dcbt AO, PRE + KERNEL8x16_1 + dcbt AO, PRE + KERNEL8x16_2 + + addic. L, L, -2 + ble .LSTRMM_L8x16_LOOP_END + + .align 5 + +.LSTRMM_L8x16_LOOP: + + dcbt AO, PRE + KERNEL8x16_1 + dcbt AO, PRE + KERNEL8x16_2 + dcbt AO, PRE + KERNEL8x16_1 + dcbt AO, PRE + KERNEL8x16_2 + + dcbt AO, PRE + KERNEL8x16_1 + dcbt AO, PRE + KERNEL8x16_2 + dcbt AO, PRE + KERNEL8x16_1 + dcbt AO, PRE + KERNEL8x16_2 + + addic. 
L, L, -1 + bgt .LSTRMM_L8x16_LOOP + +.LSTRMM_L8x16_LOOP_END: + + dcbt AO, PRE + KERNEL8x16_1 + dcbt AO, PRE + KERNEL8x16_2 + dcbt AO, PRE + KERNEL8x16_1 + dcbt AO, PRE + KERNEL8x16_2 + + dcbt AO, PRE + KERNEL8x16_1 + dcbt AO, PRE + KERNEL8x16_2 + dcbt AO, PRE + KERNEL8x16_1 + KERNEL8x16_E2 + + b .LSTRMM_L8x16_SUB1 + +.LSTRMM_L8x16_SUB4: + + dcbt AO, PRE + KERNEL8x16_SUBI1 + dcbt AO, PRE + KERNEL8x16_SUB1 + dcbt AO, PRE + KERNEL8x16_SUB1 + dcbt AO, PRE + KERNEL8x16_SUB1 + + KERNEL8x16_SUB1 + KERNEL8x16_SUB1 + KERNEL8x16_SUB1 + KERNEL8x16_SUB1 + + b .LSTRMM_L8x16_SUB1 + +.LSTRMM_L8x16_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL8x16_SUBI1 + + addic. L, L, -1 + ble .LSTRMM_L8x16_SAVE + b .LSTRMM_L8x16_SUB2 + +.LSTRMM_L8x16_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble .LSTRMM_L8x16_SAVE + +.LSTRMM_L8x16_SUB2: + + KERNEL8x16_SUB1 + + addic. L, L, -1 + bgt .LSTRMM_L8x16_SUB2 + +.LSTRMM_L8x16_SAVE: + + SAVE8x16 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 6 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 16 // KK += Number of values in A +#endif + + + addic. I, I, -1 + bgt .LSTRMM_L8x16_BEGIN + +.LSTRMM_L8x16_END: + +.LSTRMM_L8x8_BEGIN: + andi. T2, M, 15 + ble .LSTRMM_L8x1_END + + andi. 
T1, M, 8 + ble .LSTRMM_L8x8_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 5 // Number of values in B shifted + slwi T2, KK, 5 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 8 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 8 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble .LSTRMM_L8x8_SUB0 + cmpwi cr0, L, 1 + ble .LSTRMM_L8x8_SUB4 + +.LSTRMM_L8x8_LOOP_START: + + LOAD8x8_1 + KERNEL8x8_I1 + KERNEL8x8_2 + KERNEL8x8_1 + KERNEL8x8_2 + + KERNEL8x8_1 + KERNEL8x8_2 + KERNEL8x8_1 + KERNEL8x8_2 + + addic. L, L, -2 + ble .LSTRMM_L8x8_LOOP_END + + .align 5 + +.LSTRMM_L8x8_LOOP: + + KERNEL8x8_1 + KERNEL8x8_2 + KERNEL8x8_1 + KERNEL8x8_2 + + KERNEL8x8_1 + KERNEL8x8_2 + KERNEL8x8_1 + KERNEL8x8_2 + + addic. L, L, -1 + bgt .LSTRMM_L8x8_LOOP + +.LSTRMM_L8x8_LOOP_END: + + KERNEL8x8_1 + KERNEL8x8_2 + KERNEL8x8_1 + KERNEL8x8_2 + + KERNEL8x8_1 + KERNEL8x8_2 + KERNEL8x8_1 + KERNEL8x8_E2 + + b .LSTRMM_L8x8_SUB1 + +.LSTRMM_L8x8_SUB4: + + KERNEL8x8_SUBI1 + KERNEL8x8_SUB1 + KERNEL8x8_SUB1 + KERNEL8x8_SUB1 + + KERNEL8x8_SUB1 + KERNEL8x8_SUB1 + KERNEL8x8_SUB1 + KERNEL8x8_SUB1 + + b .LSTRMM_L8x8_SUB1 + +.LSTRMM_L8x8_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL8x8_SUBI1 + + addic. L, L, -1 + ble .LSTRMM_L8x8_SAVE + b .LSTRMM_L8x8_SUB2 + +.LSTRMM_L8x8_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble .LSTRMM_L8x8_SAVE + +.LSTRMM_L8x8_SUB2: + + KERNEL8x8_SUB1 + + addic. 
L, L, -1 + bgt .LSTRMM_L8x8_SUB2 + +.LSTRMM_L8x8_SAVE: + + SAVE8x8 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 5 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 8 // KK += Number of values in A +#endif + + +.LSTRMM_L8x8_END: + +.LSTRMM_L8x4_BEGIN: + + andi. T1, M, 4 + ble .LSTRMM_L8x4_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 5 // Number of values in B shifted + slwi T2, KK, 4 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 4 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 8 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble .LSTRMM_L8x4_SUB0 + cmpwi cr0, L, 1 + ble .LSTRMM_L8x4_SUB4 + +.LSTRMM_L8x4_LOOP_START: + + LOAD8x4_1 + KERNEL8x4_I1 + KERNEL8x4_2 + KERNEL8x4_1 + KERNEL8x4_2 + + KERNEL8x4_1 + KERNEL8x4_2 + KERNEL8x4_1 + KERNEL8x4_2 + + addic. L, L, -2 + ble .LSTRMM_L8x4_LOOP_END + + .align 5 + +.LSTRMM_L8x4_LOOP: + + KERNEL8x4_1 + KERNEL8x4_2 + KERNEL8x4_1 + KERNEL8x4_2 + + KERNEL8x4_1 + KERNEL8x4_2 + KERNEL8x4_1 + KERNEL8x4_2 + + addic. 
L, L, -1 + bgt .LSTRMM_L8x4_LOOP + +.LSTRMM_L8x4_LOOP_END: + + KERNEL8x4_1 + KERNEL8x4_2 + KERNEL8x4_1 + KERNEL8x4_2 + + KERNEL8x4_1 + KERNEL8x4_2 + KERNEL8x4_1 + KERNEL8x4_E2 + + b .LSTRMM_L8x4_SUB1 + +.LSTRMM_L8x4_SUB4: + + KERNEL8x4_SUBI1 + KERNEL8x4_SUB1 + KERNEL8x4_SUB1 + KERNEL8x4_SUB1 + + KERNEL8x4_SUB1 + KERNEL8x4_SUB1 + KERNEL8x4_SUB1 + KERNEL8x4_SUB1 + + b .LSTRMM_L8x4_SUB1 + +.LSTRMM_L8x4_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL8x4_SUBI1 + + addic. L, L, -1 + ble .LSTRMM_L8x4_SAVE + b .LSTRMM_L8x4_SUB2 + +.LSTRMM_L8x4_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble .LSTRMM_L8x4_SAVE + +.LSTRMM_L8x4_SUB2: + + KERNEL8x4_SUB1 + + addic. L, L, -1 + bgt .LSTRMM_L8x4_SUB2 + +.LSTRMM_L8x4_SAVE: + + SAVE8x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 4 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 4 // KK += Number of values in A +#endif + + +.LSTRMM_L8x4_END: + +.LSTRMM_L8x2_BEGIN: + + andi. T1, M, 2 + ble .LSTRMM_L8x2_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 5 // Number of values in B shifted + slwi T2, KK, 3 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 2 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 8 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. 
L, K1, 3 // KTEMP / 8 -> L + ble .LSTRMM_L8x2_SUB0 + cmpwi cr0, L, 1 + ble .LSTRMM_L8x2_SUB4 + +.LSTRMM_L8x2_LOOP_START: + + LOAD8x2_1 + KERNEL8x2_I1 + KERNEL8x2_2 + KERNEL8x2_1 + KERNEL8x2_2 + + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_1 + KERNEL8x2_2 + + addic. L, L, -2 + ble .LSTRMM_L8x2_LOOP_END + + .align 5 + +.LSTRMM_L8x2_LOOP: + + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_1 + KERNEL8x2_2 + + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_1 + KERNEL8x2_2 + + addic. L, L, -1 + bgt .LSTRMM_L8x2_LOOP + +.LSTRMM_L8x2_LOOP_END: + + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_1 + KERNEL8x2_2 + + KERNEL8x2_1 + KERNEL8x2_2 + KERNEL8x2_1 + KERNEL8x2_E2 + + b .LSTRMM_L8x2_SUB1 + +.LSTRMM_L8x2_SUB4: + + KERNEL8x2_SUBI1 + KERNEL8x2_SUB1 + KERNEL8x2_SUB1 + KERNEL8x2_SUB1 + + KERNEL8x2_SUB1 + KERNEL8x2_SUB1 + KERNEL8x2_SUB1 + KERNEL8x2_SUB1 + + b .LSTRMM_L8x2_SUB1 + +.LSTRMM_L8x2_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL8x2_SUBI1 + + addic. L, L, -1 + ble .LSTRMM_L8x2_SAVE + b .LSTRMM_L8x2_SUB2 + +.LSTRMM_L8x2_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble .LSTRMM_L8x2_SAVE + +.LSTRMM_L8x2_SUB2: + + KERNEL8x2_SUB1 + + addic. L, L, -1 + bgt .LSTRMM_L8x2_SUB2 + +.LSTRMM_L8x2_SAVE: + + SAVE8x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 3 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 2 // KK += Number of values in A +#endif + + +.LSTRMM_L8x2_END: + +.LSTRMM_L8x1_BEGIN: + + andi. 
T1, M, 1 + ble .LSTRMM_L8x1_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 5 // Number of values in B shifted + slwi T2, KK, 2 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 1 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 8 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble .LSTRMM_L8x1_SUB0 + cmpwi cr0, L, 1 + ble .LSTRMM_L8x1_SUB4 + +.LSTRMM_L8x1_LOOP_START: + + LOAD8x1_1 + KERNEL8x1_I1 + KERNEL8x1_2 + KERNEL8x1_1 + KERNEL8x1_2 + + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_1 + KERNEL8x1_2 + + addic. L, L, -2 + ble .LSTRMM_L8x1_LOOP_END + + .align 5 + +.LSTRMM_L8x1_LOOP: + + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_1 + KERNEL8x1_2 + + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_1 + KERNEL8x1_2 + + addic. L, L, -1 + bgt .LSTRMM_L8x1_LOOP + +.LSTRMM_L8x1_LOOP_END: + + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_1 + KERNEL8x1_2 + + KERNEL8x1_1 + KERNEL8x1_2 + KERNEL8x1_1 + KERNEL8x1_E2 + + b .LSTRMM_L8x1_SUB1 + +.LSTRMM_L8x1_SUB4: + + KERNEL8x1_SUBI1 + KERNEL8x1_SUB1 + KERNEL8x1_SUB1 + KERNEL8x1_SUB1 + + KERNEL8x1_SUB1 + KERNEL8x1_SUB1 + KERNEL8x1_SUB1 + KERNEL8x1_SUB1 + + b .LSTRMM_L8x1_SUB1 + +.LSTRMM_L8x1_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL8x1_SUBI1 + + addic. L, L, -1 + ble .LSTRMM_L8x1_SAVE + b .LSTRMM_L8x1_SUB2 + +.LSTRMM_L8x1_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble .LSTRMM_L8x1_SAVE + +.LSTRMM_L8x1_SUB2: + + KERNEL8x1_SUB1 + + addic. 
L, L, -1 + bgt .LSTRMM_L8x1_SUB2 + +.LSTRMM_L8x1_SAVE: + + SAVE8x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 2 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 1 // KK += Number of values in A +#endif + + +.LSTRMM_L8x1_END: + + slwi T1, K, 5 + add B, B, T1 + +#if !defined(LEFT) + addi KK, KK, 8 // KK += Number of values in B +#endif + + + addic. J, J, -1 + bgt .LSTRMM_L8_BEGIN + + andi. T2, N, 7 + ble .L999 + +.LSTRMM_L8_END: + + b .LSTRMM_L4_BEGIN + +.L999_H1: + + b .L999 + +.LSTRMM_L4_BEGIN: + + andi. T1, N, 4 + ble .LSTRMM_L4_END + mr CO, C + mr AO, A + slwi T1, LDC , 2 + add C, C, T1 + +#if defined(LEFT) + mr KK, OFFSET // OFFSET -> KK +#endif + + srawi. I, M, 4 + ble .LSTRMM_L4x16_END + +.LSTRMM_L4x16_BEGIN: + + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 4 // Number of values in B shifted + slwi T2, KK, 6 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 16 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 4 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. 
L, K1, 3 // KTEMP / 8 -> L + ble .LSTRMM_L4x16_SUB0 + cmpwi cr0, L, 1 + ble .LSTRMM_L4x16_SUB4 + +.LSTRMM_L4x16_LOOP_START: + + dcbt AO, PRE + LOAD4x16_1 + dcbt AO, PRE + KERNEL4x16_I1 + dcbt AO, PRE + KERNEL4x16_2 + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + + addic. L, L, -2 + ble .LSTRMM_L4x16_LOOP_END + + .align 5 + +.LSTRMM_L4x16_LOOP: + + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + + addic. L, L, -1 + bgt .LSTRMM_L4x16_LOOP + +.LSTRMM_L4x16_LOOP_END: + + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + + dcbt AO, PRE + KERNEL4x16_1 + dcbt AO, PRE + KERNEL4x16_2 + dcbt AO, PRE + KERNEL4x16_1 + KERNEL4x16_E2 + + b .LSTRMM_L4x16_SUB1 + +.LSTRMM_L4x16_SUB4: + + dcbt AO, PRE + KERNEL4x16_SUBI1 + dcbt AO, PRE + KERNEL4x16_SUB1 + dcbt AO, PRE + KERNEL4x16_SUB1 + dcbt AO, PRE + KERNEL4x16_SUB1 + + KERNEL4x16_SUB1 + KERNEL4x16_SUB1 + KERNEL4x16_SUB1 + KERNEL4x16_SUB1 + + b .LSTRMM_L4x16_SUB1 + +.LSTRMM_L4x16_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL4x16_SUBI1 + + addic. L, L, -1 + ble .LSTRMM_L4x16_SAVE + b .LSTRMM_L4x16_SUB2 + +.LSTRMM_L4x16_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble .LSTRMM_L4x16_SAVE + +.LSTRMM_L4x16_SUB2: + + KERNEL4x16_SUB1 + + addic. 
L, L, -1 + bgt .LSTRMM_L4x16_SUB2 + +.LSTRMM_L4x16_SAVE: + + SAVE4x16 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 6 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 16 // KK += Number of values in A +#endif + + + addic. I, I, -1 + bgt .LSTRMM_L4x16_BEGIN + +.LSTRMM_L4x16_END: + +.LSTRMM_L4x8_BEGIN: + andi. T2, M, 15 + ble .LSTRMM_L4x1_END + + andi. T1, M, 8 + ble .LSTRMM_L4x8_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 4 // Number of values in B shifted + slwi T2, KK, 5 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 8 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 4 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble .LSTRMM_L4x8_SUB0 + cmpwi cr0, L, 1 + ble .LSTRMM_L4x8_SUB4 + +.LSTRMM_L4x8_LOOP_START: + + LOAD4x8_1 + KERNEL4x8_I1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_2 + + KERNEL4x8_1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_2 + + addic. L, L, -2 + ble .LSTRMM_L4x8_LOOP_END + + .align 5 + +.LSTRMM_L4x8_LOOP: + + KERNEL4x8_1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_2 + + KERNEL4x8_1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_2 + + addic. 
L, L, -1 + bgt .LSTRMM_L4x8_LOOP + +.LSTRMM_L4x8_LOOP_END: + + KERNEL4x8_1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_2 + + KERNEL4x8_1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_E2 + + b .LSTRMM_L4x8_SUB1 + +.LSTRMM_L4x8_SUB4: + + KERNEL4x8_SUBI1 + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + + b .LSTRMM_L4x8_SUB1 + +.LSTRMM_L4x8_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL4x8_SUBI1 + + addic. L, L, -1 + ble .LSTRMM_L4x8_SAVE + b .LSTRMM_L4x8_SUB2 + +.LSTRMM_L4x8_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble .LSTRMM_L4x8_SAVE + +.LSTRMM_L4x8_SUB2: + + KERNEL4x8_SUB1 + + addic. L, L, -1 + bgt .LSTRMM_L4x8_SUB2 + +.LSTRMM_L4x8_SAVE: + + SAVE4x8 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 5 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 8 // KK += Number of values in A +#endif + + +.LSTRMM_L4x8_END: + +.LSTRMM_L4x4_BEGIN: + + andi. T1, M, 4 + ble .LSTRMM_L4x4_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 4 // Number of values in B shifted + slwi T2, KK, 4 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 4 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 4 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. 
L, K1, 3 // KTEMP / 8 -> L + ble .LSTRMM_L4x4_SUB0 + cmpwi cr0, L, 1 + ble .LSTRMM_L4x4_SUB4 + +.LSTRMM_L4x4_LOOP_START: + + LOAD4x4_1 + KERNEL4x4_I1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + addic. L, L, -2 + ble .LSTRMM_L4x4_LOOP_END + + .align 5 + +.LSTRMM_L4x4_LOOP: + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + addic. L, L, -1 + bgt .LSTRMM_L4x4_LOOP + +.LSTRMM_L4x4_LOOP_END: + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_E2 + + b .LSTRMM_L4x4_SUB1 + +.LSTRMM_L4x4_SUB4: + + KERNEL4x4_SUBI1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + + b .LSTRMM_L4x4_SUB1 + +.LSTRMM_L4x4_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL4x4_SUBI1 + + addic. L, L, -1 + ble .LSTRMM_L4x4_SAVE + b .LSTRMM_L4x4_SUB2 + +.LSTRMM_L4x4_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble .LSTRMM_L4x4_SAVE + +.LSTRMM_L4x4_SUB2: + + KERNEL4x4_SUB1 + + addic. L, L, -1 + bgt .LSTRMM_L4x4_SUB2 + +.LSTRMM_L4x4_SAVE: + + SAVE4x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 4 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 4 // KK += Number of values in A +#endif + + +.LSTRMM_L4x4_END: + +.LSTRMM_L4x2_BEGIN: + + andi. 
T1, M, 2 + ble .LSTRMM_L4x2_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 4 // Number of values in B shifted + slwi T2, KK, 3 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 2 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 4 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble .LSTRMM_L4x2_SUB0 + cmpwi cr0, L, 1 + ble .LSTRMM_L4x2_SUB4 + +.LSTRMM_L4x2_LOOP_START: + + LOAD4x2_1 + KERNEL4x2_I1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + addic. L, L, -2 + ble .LSTRMM_L4x2_LOOP_END + + .align 5 + +.LSTRMM_L4x2_LOOP: + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + addic. L, L, -1 + bgt .LSTRMM_L4x2_LOOP + +.LSTRMM_L4x2_LOOP_END: + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_E2 + + b .LSTRMM_L4x2_SUB1 + +.LSTRMM_L4x2_SUB4: + + KERNEL4x2_SUBI1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + + b .LSTRMM_L4x2_SUB1 + +.LSTRMM_L4x2_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL4x2_SUBI1 + + addic. L, L, -1 + ble .LSTRMM_L4x2_SAVE + b .LSTRMM_L4x2_SUB2 + +.LSTRMM_L4x2_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble .LSTRMM_L4x2_SAVE + +.LSTRMM_L4x2_SUB2: + + KERNEL4x2_SUB1 + + addic. 
L, L, -1 + bgt .LSTRMM_L4x2_SUB2 + +.LSTRMM_L4x2_SAVE: + + SAVE4x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 3 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 2 // KK += Number of values in A +#endif + + +.LSTRMM_L4x2_END: + +.LSTRMM_L4x1_BEGIN: + + andi. T1, M, 1 + ble .LSTRMM_L4x1_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 4 // Number of values in B shifted + slwi T2, KK, 2 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 1 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 4 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble .LSTRMM_L4x1_SUB0 + cmpwi cr0, L, 1 + ble .LSTRMM_L4x1_SUB4 + +.LSTRMM_L4x1_LOOP_START: + + LOAD4x1_1 + KERNEL4x1_I1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + addic. L, L, -2 + ble .LSTRMM_L4x1_LOOP_END + + .align 5 + +.LSTRMM_L4x1_LOOP: + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + addic. 
L, L, -1 + bgt .LSTRMM_L4x1_LOOP + +.LSTRMM_L4x1_LOOP_END: + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_E2 + + b .LSTRMM_L4x1_SUB1 + +.LSTRMM_L4x1_SUB4: + + KERNEL4x1_SUBI1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + + b .LSTRMM_L4x1_SUB1 + +.LSTRMM_L4x1_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL4x1_SUBI1 + + addic. L, L, -1 + ble .LSTRMM_L4x1_SAVE + b .LSTRMM_L4x1_SUB2 + +.LSTRMM_L4x1_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble .LSTRMM_L4x1_SAVE + +.LSTRMM_L4x1_SUB2: + + KERNEL4x1_SUB1 + + addic. L, L, -1 + bgt .LSTRMM_L4x1_SUB2 + +.LSTRMM_L4x1_SAVE: + + SAVE4x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 2 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 1 // KK += Number of values in A +#endif + + +.LSTRMM_L4x1_END: + + slwi T1, K, 4 + add B, B, T1 + +#if !defined(LEFT) + addi KK, KK, 4 // KK += Number of values in B +#endif + + +.LSTRMM_L4_END: +.LSTRMM_L2_BEGIN: + + andi. T1, N, 2 + ble .LSTRMM_L2_END + mr CO, C + mr AO, A + slwi T1, LDC , 1 + add C, C, T1 + +#if defined(LEFT) + mr KK, OFFSET // OFFSET -> KK +#endif + + srawi. 
I, M, 4 + ble .LSTRMM_L2x16_END + +.LSTRMM_L2x16_BEGIN: + + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 3 // Number of values in B shifted + slwi T2, KK, 6 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 16 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble .LSTRMM_L2x16_SUB0 + cmpwi cr0, L, 1 + ble .LSTRMM_L2x16_SUB4 + +.LSTRMM_L2x16_LOOP_START: + + dcbt AO, PRE + LOAD2x16_1 + dcbt AO, PRE + KERNEL2x16_I1 + dcbt AO, PRE + KERNEL2x16_2 + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + + addic. L, L, -2 + ble .LSTRMM_L2x16_LOOP_END + + .align 5 + +.LSTRMM_L2x16_LOOP: + + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + + addic. 
L, L, -1 + bgt .LSTRMM_L2x16_LOOP + +.LSTRMM_L2x16_LOOP_END: + + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + + dcbt AO, PRE + KERNEL2x16_1 + dcbt AO, PRE + KERNEL2x16_2 + dcbt AO, PRE + KERNEL2x16_1 + KERNEL2x16_E2 + + b .LSTRMM_L2x16_SUB1 + +.LSTRMM_L2x16_SUB4: + + dcbt AO, PRE + KERNEL2x16_SUBI1 + dcbt AO, PRE + KERNEL2x16_SUB1 + dcbt AO, PRE + KERNEL2x16_SUB1 + dcbt AO, PRE + KERNEL2x16_SUB1 + + KERNEL2x16_SUB1 + KERNEL2x16_SUB1 + KERNEL2x16_SUB1 + KERNEL2x16_SUB1 + + b .LSTRMM_L2x16_SUB1 + +.LSTRMM_L2x16_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL2x16_SUBI1 + + addic. L, L, -1 + ble .LSTRMM_L2x16_SAVE + b .LSTRMM_L2x16_SUB2 + +.LSTRMM_L2x16_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble .LSTRMM_L2x16_SAVE + +.LSTRMM_L2x16_SUB2: + + KERNEL2x16_SUB1 + + addic. L, L, -1 + bgt .LSTRMM_L2x16_SUB2 + +.LSTRMM_L2x16_SAVE: + + SAVE2x16 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 3 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 6 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 16 // KK += Number of values in A +#endif + + + addic. I, I, -1 + bgt .LSTRMM_L2x16_BEGIN + +.LSTRMM_L2x16_END: + +.LSTRMM_L2x8_BEGIN: + andi. T2, M, 15 + ble .LSTRMM_L2x1_END + + andi. 
T1, M, 8 + ble .LSTRMM_L2x8_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 3 // Number of values in B shifted + slwi T2, KK, 5 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 8 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble .LSTRMM_L2x8_SUB0 + cmpwi cr0, L, 1 + ble .LSTRMM_L2x8_SUB4 + +.LSTRMM_L2x8_LOOP_START: + + LOAD2x8_1 + KERNEL2x8_I1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + addic. L, L, -2 + ble .LSTRMM_L2x8_LOOP_END + + .align 5 + +.LSTRMM_L2x8_LOOP: + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + addic. L, L, -1 + bgt .LSTRMM_L2x8_LOOP + +.LSTRMM_L2x8_LOOP_END: + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_E2 + + b .LSTRMM_L2x8_SUB1 + +.LSTRMM_L2x8_SUB4: + + KERNEL2x8_SUBI1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + + b .LSTRMM_L2x8_SUB1 + +.LSTRMM_L2x8_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL2x8_SUBI1 + + addic. L, L, -1 + ble .LSTRMM_L2x8_SAVE + b .LSTRMM_L2x8_SUB2 + +.LSTRMM_L2x8_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble .LSTRMM_L2x8_SAVE + +.LSTRMM_L2x8_SUB2: + + KERNEL2x8_SUB1 + + addic. 
L, L, -1 + bgt .LSTRMM_L2x8_SUB2 + +.LSTRMM_L2x8_SAVE: + + SAVE2x8 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 3 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 5 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 8 // KK += Number of values in A +#endif + + +.LSTRMM_L2x8_END: + +.LSTRMM_L2x4_BEGIN: + + andi. T1, M, 4 + ble .LSTRMM_L2x4_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 3 // Number of values in B shifted + slwi T2, KK, 4 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 4 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble .LSTRMM_L2x4_SUB0 + cmpwi cr0, L, 1 + ble .LSTRMM_L2x4_SUB4 + +.LSTRMM_L2x4_LOOP_START: + + LOAD2x4_1 + KERNEL2x4_I1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + addic. L, L, -2 + ble .LSTRMM_L2x4_LOOP_END + + .align 5 + +.LSTRMM_L2x4_LOOP: + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + addic. 
L, L, -1 + bgt .LSTRMM_L2x4_LOOP + +.LSTRMM_L2x4_LOOP_END: + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_E2 + + b .LSTRMM_L2x4_SUB1 + +.LSTRMM_L2x4_SUB4: + + KERNEL2x4_SUBI1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + + b .LSTRMM_L2x4_SUB1 + +.LSTRMM_L2x4_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL2x4_SUBI1 + + addic. L, L, -1 + ble .LSTRMM_L2x4_SAVE + b .LSTRMM_L2x4_SUB2 + +.LSTRMM_L2x4_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble .LSTRMM_L2x4_SAVE + +.LSTRMM_L2x4_SUB2: + + KERNEL2x4_SUB1 + + addic. L, L, -1 + bgt .LSTRMM_L2x4_SUB2 + +.LSTRMM_L2x4_SAVE: + + SAVE2x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 3 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 4 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 4 // KK += Number of values in A +#endif + + +.LSTRMM_L2x4_END: + +.LSTRMM_L2x2_BEGIN: + + andi. T1, M, 2 + ble .LSTRMM_L2x2_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 3 // Number of values in B shifted + slwi T2, KK, 3 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 2 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. 
L, K1, 3 // KTEMP / 8 -> L + ble .LSTRMM_L2x2_SUB0 + cmpwi cr0, L, 1 + ble .LSTRMM_L2x2_SUB4 + +.LSTRMM_L2x2_LOOP_START: + + LOAD2x2_1 + KERNEL2x2_I1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + addic. L, L, -2 + ble .LSTRMM_L2x2_LOOP_END + + .align 5 + +.LSTRMM_L2x2_LOOP: + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + addic. L, L, -1 + bgt .LSTRMM_L2x2_LOOP + +.LSTRMM_L2x2_LOOP_END: + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_E2 + + b .LSTRMM_L2x2_SUB1 + +.LSTRMM_L2x2_SUB4: + + KERNEL2x2_SUBI1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + + b .LSTRMM_L2x2_SUB1 + +.LSTRMM_L2x2_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL2x2_SUBI1 + + addic. L, L, -1 + ble .LSTRMM_L2x2_SAVE + b .LSTRMM_L2x2_SUB2 + +.LSTRMM_L2x2_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble .LSTRMM_L2x2_SAVE + +.LSTRMM_L2x2_SUB2: + + KERNEL2x2_SUB1 + + addic. L, L, -1 + bgt .LSTRMM_L2x2_SUB2 + +.LSTRMM_L2x2_SAVE: + + SAVE2x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 3 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 3 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 2 // KK += Number of values in A +#endif + + +.LSTRMM_L2x2_END: + +.LSTRMM_L2x1_BEGIN: + + andi. 
T1, M, 1 + ble .LSTRMM_L2x1_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 3 // Number of values in B shifted + slwi T2, KK, 2 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 1 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble .LSTRMM_L2x1_SUB0 + cmpwi cr0, L, 1 + ble .LSTRMM_L2x1_SUB4 + +.LSTRMM_L2x1_LOOP_START: + + LOAD2x1_1 + KERNEL2x1_I1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + addic. L, L, -2 + ble .LSTRMM_L2x1_LOOP_END + + .align 5 + +.LSTRMM_L2x1_LOOP: + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + addic. L, L, -1 + bgt .LSTRMM_L2x1_LOOP + +.LSTRMM_L2x1_LOOP_END: + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_E2 + + b .LSTRMM_L2x1_SUB1 + +.LSTRMM_L2x1_SUB4: + + KERNEL2x1_SUBI1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + + b .LSTRMM_L2x1_SUB1 + +.LSTRMM_L2x1_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL2x1_SUBI1 + + addic. L, L, -1 + ble .LSTRMM_L2x1_SAVE + b .LSTRMM_L2x1_SUB2 + +.LSTRMM_L2x1_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble .LSTRMM_L2x1_SAVE + +.LSTRMM_L2x1_SUB2: + + KERNEL2x1_SUB1 + + addic. 
L, L, -1 + bgt .LSTRMM_L2x1_SUB2 + +.LSTRMM_L2x1_SAVE: + + SAVE2x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 3 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 2 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 1 // KK += Number of values in A +#endif + + +.LSTRMM_L2x1_END: + + slwi T1, K, 3 + add B, B, T1 + +#if !defined(LEFT) + addi KK, KK, 2 // KK += Number of values in B +#endif + + +.LSTRMM_L2_END: +.LSTRMM_L1_BEGIN: + + andi. T1, N, 1 + ble .LSTRMM_L1_END + mr CO, C + mr AO, A + +#if defined(LEFT) + mr KK, OFFSET // OFFSET -> KK +#endif + + srawi. I, M, 4 + ble .LSTRMM_L1x16_END + +.LSTRMM_L1x16_BEGIN: + + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 2 // Number of values in B shifted + slwi T2, KK, 6 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 16 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble .LSTRMM_L1x16_SUB0 + cmpwi cr0, L, 1 + ble .LSTRMM_L1x16_SUB4 + +.LSTRMM_L1x16_LOOP_START: + + dcbt AO, PRE + LOAD1x16_1 + dcbt AO, PRE + KERNEL1x16_I1 + dcbt AO, PRE + KERNEL1x16_2 + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + + addic. 
L, L, -2 + ble .LSTRMM_L1x16_LOOP_END + + .align 5 + +.LSTRMM_L1x16_LOOP: + + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + + addic. L, L, -1 + bgt .LSTRMM_L1x16_LOOP + +.LSTRMM_L1x16_LOOP_END: + + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + + dcbt AO, PRE + KERNEL1x16_1 + dcbt AO, PRE + KERNEL1x16_2 + dcbt AO, PRE + KERNEL1x16_1 + KERNEL1x16_E2 + + b .LSTRMM_L1x16_SUB1 + +.LSTRMM_L1x16_SUB4: + + dcbt AO, PRE + KERNEL1x16_SUBI1 + dcbt AO, PRE + KERNEL1x16_SUB1 + dcbt AO, PRE + KERNEL1x16_SUB1 + dcbt AO, PRE + KERNEL1x16_SUB1 + + KERNEL1x16_SUB1 + KERNEL1x16_SUB1 + KERNEL1x16_SUB1 + KERNEL1x16_SUB1 + + b .LSTRMM_L1x16_SUB1 + +.LSTRMM_L1x16_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL1x16_SUBI1 + + addic. L, L, -1 + ble .LSTRMM_L1x16_SAVE + b .LSTRMM_L1x16_SUB2 + +.LSTRMM_L1x16_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble .LSTRMM_L1x16_SAVE + +.LSTRMM_L1x16_SUB2: + + KERNEL1x16_SUB1 + + addic. L, L, -1 + bgt .LSTRMM_L1x16_SUB2 + +.LSTRMM_L1x16_SAVE: + + SAVE1x16 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 2 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 6 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 16 // KK += Number of values in A +#endif + + + addic. I, I, -1 + bgt .LSTRMM_L1x16_BEGIN + +.LSTRMM_L1x16_END: + +.LSTRMM_L1x8_BEGIN: + andi. T2, M, 15 + ble .LSTRMM_L1x1_END + + andi. 
T1, M, 8 + ble .LSTRMM_L1x8_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 2 // Number of values in B shifted + slwi T2, KK, 5 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 8 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble .LSTRMM_L1x8_SUB0 + cmpwi cr0, L, 1 + ble .LSTRMM_L1x8_SUB4 + +.LSTRMM_L1x8_LOOP_START: + + LOAD1x8_1 + KERNEL1x8_I1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_2 + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_2 + + addic. L, L, -2 + ble .LSTRMM_L1x8_LOOP_END + + .align 5 + +.LSTRMM_L1x8_LOOP: + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_2 + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_2 + + addic. L, L, -1 + bgt .LSTRMM_L1x8_LOOP + +.LSTRMM_L1x8_LOOP_END: + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_2 + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_E2 + + b .LSTRMM_L1x8_SUB1 + +.LSTRMM_L1x8_SUB4: + + KERNEL1x8_SUBI1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + + b .LSTRMM_L1x8_SUB1 + +.LSTRMM_L1x8_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL1x8_SUBI1 + + addic. L, L, -1 + ble .LSTRMM_L1x8_SAVE + b .LSTRMM_L1x8_SUB2 + +.LSTRMM_L1x8_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble .LSTRMM_L1x8_SAVE + +.LSTRMM_L1x8_SUB2: + + KERNEL1x8_SUB1 + + addic. 
L, L, -1 + bgt .LSTRMM_L1x8_SUB2 + +.LSTRMM_L1x8_SAVE: + + SAVE1x8 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 2 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 5 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 8 // KK += Number of values in A +#endif + + +.LSTRMM_L1x8_END: + +.LSTRMM_L1x4_BEGIN: + + andi. T1, M, 4 + ble .LSTRMM_L1x4_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 2 // Number of values in B shifted + slwi T2, KK, 4 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 4 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble .LSTRMM_L1x4_SUB0 + cmpwi cr0, L, 1 + ble .LSTRMM_L1x4_SUB4 + +.LSTRMM_L1x4_LOOP_START: + + LOAD1x4_1 + KERNEL1x4_I1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + addic. L, L, -2 + ble .LSTRMM_L1x4_LOOP_END + + .align 5 + +.LSTRMM_L1x4_LOOP: + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + addic. 
L, L, -1 + bgt .LSTRMM_L1x4_LOOP + +.LSTRMM_L1x4_LOOP_END: + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_E2 + + b .LSTRMM_L1x4_SUB1 + +.LSTRMM_L1x4_SUB4: + + KERNEL1x4_SUBI1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + + b .LSTRMM_L1x4_SUB1 + +.LSTRMM_L1x4_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL1x4_SUBI1 + + addic. L, L, -1 + ble .LSTRMM_L1x4_SAVE + b .LSTRMM_L1x4_SUB2 + +.LSTRMM_L1x4_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble .LSTRMM_L1x4_SAVE + +.LSTRMM_L1x4_SUB2: + + KERNEL1x4_SUB1 + + addic. L, L, -1 + bgt .LSTRMM_L1x4_SUB2 + +.LSTRMM_L1x4_SAVE: + + SAVE1x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 2 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 4 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 4 // KK += Number of values in A +#endif + + +.LSTRMM_L1x4_END: + +.LSTRMM_L1x2_BEGIN: + + andi. T1, M, 2 + ble .LSTRMM_L1x2_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 2 // Number of values in B shifted + slwi T2, KK, 3 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 2 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. 
L, K1, 3 // KTEMP / 8 -> L + ble .LSTRMM_L1x2_SUB0 + cmpwi cr0, L, 1 + ble .LSTRMM_L1x2_SUB4 + +.LSTRMM_L1x2_LOOP_START: + + LOAD1x2_1 + KERNEL1x2_I1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + addic. L, L, -2 + ble .LSTRMM_L1x2_LOOP_END + + .align 5 + +.LSTRMM_L1x2_LOOP: + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + addic. L, L, -1 + bgt .LSTRMM_L1x2_LOOP + +.LSTRMM_L1x2_LOOP_END: + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_E2 + + b .LSTRMM_L1x2_SUB1 + +.LSTRMM_L1x2_SUB4: + + KERNEL1x2_SUBI1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + + b .LSTRMM_L1x2_SUB1 + +.LSTRMM_L1x2_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL1x2_SUBI1 + + addic. L, L, -1 + ble .LSTRMM_L1x2_SAVE + b .LSTRMM_L1x2_SUB2 + +.LSTRMM_L1x2_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble .LSTRMM_L1x2_SAVE + +.LSTRMM_L1x2_SUB2: + + KERNEL1x2_SUB1 + + addic. L, L, -1 + bgt .LSTRMM_L1x2_SUB2 + +.LSTRMM_L1x2_SAVE: + + SAVE1x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 2 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 3 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 2 // KK += Number of values in A +#endif + + +.LSTRMM_L1x2_END: + +.LSTRMM_L1x1_BEGIN: + + andi. 
T1, M, 1 + ble .LSTRMM_L1x1_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 2 // Number of values in B shifted + slwi T2, KK, 2 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 1 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble .LSTRMM_L1x1_SUB0 + cmpwi cr0, L, 1 + ble .LSTRMM_L1x1_SUB4 + +.LSTRMM_L1x1_LOOP_START: + + LOAD1x1_1 + KERNEL1x1_I1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + addic. L, L, -2 + ble .LSTRMM_L1x1_LOOP_END + + .align 5 + +.LSTRMM_L1x1_LOOP: + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + addic. L, L, -1 + bgt .LSTRMM_L1x1_LOOP + +.LSTRMM_L1x1_LOOP_END: + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_E2 + + b .LSTRMM_L1x1_SUB1 + +.LSTRMM_L1x1_SUB4: + + KERNEL1x1_SUBI1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + + b .LSTRMM_L1x1_SUB1 + +.LSTRMM_L1x1_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL1x1_SUBI1 + + addic. L, L, -1 + ble .LSTRMM_L1x1_SAVE + b .LSTRMM_L1x1_SUB2 + +.LSTRMM_L1x1_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble .LSTRMM_L1x1_SAVE + +.LSTRMM_L1x1_SUB2: + + KERNEL1x1_SUB1 + + addic. 
L, L, -1 + bgt .LSTRMM_L1x1_SUB2 + +.LSTRMM_L1x1_SAVE: + + SAVE1x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 2 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 2 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 1 // KK += Number of values in A +#endif + + +.LSTRMM_L1x1_END: + +#if !defined(LEFT) + addi KK, KK, 1 // KK += Number of values in B +#endif + + +.LSTRMM_L1_END: diff --git a/param.h b/param.h index 31125d8e4..f5d1ab2ea 100644 --- a/param.h +++ b/param.h @@ -1961,15 +1961,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(POWER8) -#define SNUMOPT 4 +#define SNUMOPT 16 #define DNUMOPT 8 #define GEMM_DEFAULT_OFFSET_A 384 #define GEMM_DEFAULT_OFFSET_B 1024 #define GEMM_DEFAULT_ALIGN 0x03fffUL -#define SGEMM_DEFAULT_UNROLL_M 4 -#define SGEMM_DEFAULT_UNROLL_N 4 +#define SGEMM_DEFAULT_UNROLL_M 16 +#define SGEMM_DEFAULT_UNROLL_N 8 #define DGEMM_DEFAULT_UNROLL_M 16 #define DGEMM_DEFAULT_UNROLL_N 4 #define CGEMM_DEFAULT_UNROLL_M 2 @@ -1977,12 +1977,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define ZGEMM_DEFAULT_UNROLL_M 8 #define ZGEMM_DEFAULT_UNROLL_N 2 -#define SGEMM_DEFAULT_P 992 +#define SGEMM_DEFAULT_P 960 #define DGEMM_DEFAULT_P 480 #define CGEMM_DEFAULT_P 488 #define ZGEMM_DEFAULT_P 240 -#define SGEMM_DEFAULT_Q 504 +#define SGEMM_DEFAULT_Q 720 #define DGEMM_DEFAULT_Q 720 #define CGEMM_DEFAULT_Q 400 #define ZGEMM_DEFAULT_Q 360 From dcd15b546c14d06b70721fe1a08d43ceb61e6b6f Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Mon, 14 Mar 2016 14:36:59 +0100 Subject: [PATCH 02/48] BUGFIX: KERNEL.POWER8 --- kernel/power/KERNEL.POWER8 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index d40b20dd8..f8be1d40f 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -150,7 +150,7 @@ ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c #SGEMVTKERNEL = ../arm/gemv_t.c #DGEMVTKERNEL = ../arm/gemv_t.c #CGEMVTKERNEL = ../arm/zgemv_t.c -ZGEMVTKERNEL = zgemv_t_4.c +#ZGEMVTKERNEL = zgemv_t_4.c #SSYMV_U_KERNEL = ../generic/symv_k.c From ec4390a967cb57eddff39f1b0bf6f3e45a5387a0 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Tue, 15 Mar 2016 14:52:01 -0400 Subject: [PATCH 03/48] Bump devlop version to 0.2.17.dev. 
--- CMakeLists.txt | 2 +- Makefile.rule | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 58b3971e6..ae7b973bc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.4) project(OpenBLAS) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 2) -set(OpenBLAS_PATCH_VERSION 16) +set(OpenBLAS_PATCH_VERSION 17.dev) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") enable_language(ASM) diff --git a/Makefile.rule b/Makefile.rule index fe4219aab..20036d559 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.2.16 +VERSION = 0.2.17.dev # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library From 5c658f8746835ea8e0b22829ed049888cbd6fe7d Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Fri, 18 Mar 2016 08:17:25 +0100 Subject: [PATCH 04/48] add optimized cgemm- and ctrmm-kernel for POWER8 --- kernel/power/KERNEL.POWER8 | 12 +- kernel/power/cgemm_kernel_8x4_power8.S | 375 ++ kernel/power/cgemm_logic_8x4_power8.S | 1342 +++++ kernel/power/cgemm_macros_8x4_power8.S | 6713 ++++++++++++++++++++++++ kernel/power/ctrmm_kernel_8x4_power8.S | 385 ++ kernel/power/ctrmm_logic_8x4_power8.S | 1756 +++++++ param.h | 9 +- 7 files changed, 10584 insertions(+), 8 deletions(-) create mode 100644 kernel/power/cgemm_kernel_8x4_power8.S create mode 100644 kernel/power/cgemm_logic_8x4_power8.S create mode 100644 kernel/power/cgemm_macros_8x4_power8.S create mode 100644 kernel/power/ctrmm_kernel_8x4_power8.S create mode 100644 kernel/power/ctrmm_logic_8x4_power8.S diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index f8be1d40f..eaa9f26ed 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -5,7 +5,7 @@ STRMMKERNEL = strmm_kernel_16x8_power8.S 
DTRMMKERNEL = dtrmm_kernel_16x4_power8.S -CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c +CTRMMKERNEL = ctrmm_kernel_8x4_power8.S ZTRMMKERNEL = ztrmm_kernel_8x2_power8.S SGEMMKERNEL = sgemm_kernel_16x8_power8.S @@ -28,11 +28,15 @@ DGEMMITCOPYOBJ = dgemm_itcopy.o DGEMMONCOPYOBJ = dgemm_oncopy.o DGEMMOTCOPYOBJ = dgemm_otcopy.o -CGEMMKERNEL = ../generic/zgemmkernel_2x2.c -CGEMMONCOPY = ../generic/zgemm_ncopy_2.c -CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMKERNEL = cgemm_kernel_8x4_power8.S +CGEMMINCOPY = ../generic/zgemm_ncopy_8.c +CGEMMITCOPY = ../generic/zgemm_tcopy_8.c +CGEMMONCOPY = ../generic/zgemm_ncopy_4.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c CGEMMONCOPYOBJ = cgemm_oncopy.o CGEMMOTCOPYOBJ = cgemm_otcopy.o +CGEMMINCOPYOBJ = cgemm_incopy.o +CGEMMITCOPYOBJ = cgemm_itcopy.o ZGEMMKERNEL = zgemm_kernel_8x2_power8.S ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c diff --git a/kernel/power/cgemm_kernel_8x4_power8.S b/kernel/power/cgemm_kernel_8x4_power8.S new file mode 100644 index 000000000..f732c8132 --- /dev/null +++ b/kernel/power/cgemm_kernel_8x4_power8.S @@ -0,0 +1,375 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/03/18 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "def_vsx.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 400 +#define ALPHA_R_SP 296(SP) +#define ALPHA_I_SP 304(SP) +#define FZERO 312(SP) +#else +#define STACKSIZE 256 +#define ALPHA_R_SP 224(SP) +#define ALPHA_I_SP 232(SP) +#define FZERO 240(SP) +#endif + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r10 +#define B r6 +#define C r7 +#define LDC r8 +#define OFFSET r9 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 +#endif +#endif + +#define o0 0 +#define alpha_r vs30 +#define alpha_i vs31 + +#define TBUFFER r14 +#define L r15 +#define o12 r16 +#define o4 r17 +#define T2 r19 +#define KK r20 +#define o8 r21 +#define I r22 +#define J r23 +#define AO r24 +#define BO r25 +#define CO r26 +#define o16 r27 +#define o32 r28 +#define o48 r29 + +#define PRE r30 +#define T1 r31 + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 
232(SP) + std r19, 240(SP) + std r18, 248(SP) + std r17, 256(SP) + std r16, 264(SP) + std r15, 272(SP) + std r14, 280(SP) +#else + stw r31, 144(SP) + stw r30, 148(SP) + stw r29, 152(SP) + stw r28, 156(SP) + stw r27, 160(SP) + stw r26, 164(SP) + stw r25, 168(SP) + stw r24, 172(SP) + stw r23, 176(SP) + stw r22, 180(SP) + stw r21, 184(SP) + stw r20, 188(SP) + stw r19, 192(SP) + stw r18, 196(SP) + stw r17, 200(SP) + stw r16, 204(SP) + stw r15, 208(SP) + stw r14, 212(SP) +#endif + + stfs f1, ALPHA_R_SP + stfs f2, ALPHA_I_SP + // stw r0, FZERO + +#ifdef linux +#ifdef __64BIT__ + ld LDC, FRAMESLOT(0) + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld LDC, FRAMESLOT(0) + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz B, FRAMESLOT(0) + STACKSIZE(SP) + lwz C, FRAMESLOT(1) + STACKSIZE(SP) + lwz LDC, FRAMESLOT(2) + STACKSIZE(SP) +#else + lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) +#endif +#endif +#endif + +#ifdef TRMMKERNEL +#if defined(linux) && defined(__64BIT__) + ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz OFFSET, FRAMESLOT(3) + STACKSIZE(SP) +#else + lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) +#endif +#endif +#endif +#if defined(TRMMKERNEL) && !defined(LEFT) + neg KK, OFFSET +#endif +#endif + +#include "cgemm_macros_8x4_power8.S" + + cmpwi cr0, M, 0 + ble .L999_H1 + cmpwi cr0, N, 0 + ble .L999_H1 + cmpwi cr0, K, 0 + ble .L999_H1 + + slwi LDC, LDC, ZBASE_SHIFT + li PRE, 256 + li o4 , 4 + li o8 , 8 + li o12 , 12 + li o16 , 16 + li o32 , 32 + li o48 , 48 + addi TBUFFER, SP, 360 + + +#ifdef __64BIT__ + addi T1 , SP, 296 +#else + addi T1 , SP, 224 +#endif + + lxsspx alpha_r, 0, T1 + lxsspx alpha_i, o8, T1 + + .align 5 + +#include "cgemm_logic_8x4_power8.S" + +.L999: + addi r3, 0, 0 + + lfd f14, 0(SP) + lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + 
lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) + ld r17, 256(SP) + ld r16, 264(SP) + ld r15, 272(SP) + ld r14, 280(SP) +#else + lwz r31, 144(SP) + lwz r30, 148(SP) + lwz r29, 152(SP) + lwz r28, 156(SP) + lwz r27, 160(SP) + lwz r26, 164(SP) + lwz r25, 168(SP) + lwz r24, 172(SP) + lwz r23, 176(SP) + lwz r22, 180(SP) + lwz r21, 184(SP) + lwz r20, 188(SP) + lwz r19, 192(SP) + lwz r18, 196(SP) + lwz r17, 200(SP) + lwz r16, 204(SP) + lwz r15, 208(SP) + lwz r14, 212(SP) +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE +#endif diff --git a/kernel/power/cgemm_logic_8x4_power8.S b/kernel/power/cgemm_logic_8x4_power8.S new file mode 100644 index 000000000..51a063126 --- /dev/null +++ b/kernel/power/cgemm_logic_8x4_power8.S @@ -0,0 +1,1342 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/03/18 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + + + srawi. J, N, 2 + ble .LCGEMM_L4_END + +.LCGEMM_L4_BEGIN: + + mr CO, C + mr AO, A + slwi T1, LDC , 2 + add C, C, T1 + srawi. I, M, 3 + ble .LCGEMM_L4x8_END + +.LCGEMM_L4x8_BEGIN: + + + mr BO, B + srawi. L, K, 3 + ble .LCGEMM_L4x8_SUB0 + cmpwi cr0, L, 1 + ble .LCGEMM_L4x8_SUB4 + +.LCGEMM_L4x8_LOOP_START: + + dcbt AO, PRE + LOAD4x8_1 + KERNEL4x8_I1 + dcbt AO, PRE + KERNEL4x8_2 + KERNEL4x8_1 + dcbt AO, PRE + KERNEL4x8_2 + + KERNEL4x8_1 + dcbt AO, PRE + KERNEL4x8_2 + KERNEL4x8_1 + dcbt AO, PRE + KERNEL4x8_2 + + addic. 
L, L, -2 + ble .LCGEMM_L4x8_LOOP_END + + .align 5 + +.LCGEMM_L4x8_LOOP: + + KERNEL4x8_1 + dcbt AO, PRE + KERNEL4x8_2 + KERNEL4x8_1 + dcbt AO, PRE + KERNEL4x8_2 + + KERNEL4x8_1 + dcbt AO, PRE + KERNEL4x8_2 + KERNEL4x8_1 + dcbt AO, PRE + KERNEL4x8_2 + + addic. L, L, -1 + bgt .LCGEMM_L4x8_LOOP + +.LCGEMM_L4x8_LOOP_END: + + KERNEL4x8_1 + dcbt AO, PRE + KERNEL4x8_2 + KERNEL4x8_1 + dcbt AO, PRE + KERNEL4x8_2 + + KERNEL4x8_1 + dcbt AO, PRE + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_E2 + + b .LCGEMM_L4x8_SUB1 + +.LCGEMM_L4x8_SUB4: + + KERNEL4x8_SUBI1 + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + + b .LCGEMM_L4x8_SUB1 + +.LCGEMM_L4x8_SUB0: + + andi. L, K, 7 + + KERNEL4x8_SUBI1 + + addic. L, L, -1 + ble .LCGEMM_L4x8_SAVE + b .LCGEMM_L4x8_SUB2 + +.LCGEMM_L4x8_SUB1: + + andi. L, K, 7 + ble .LCGEMM_L4x8_SAVE + +.LCGEMM_L4x8_SUB2: + + KERNEL4x8_SUB1 + + addic. L, L, -1 + bgt .LCGEMM_L4x8_SUB2 + +.LCGEMM_L4x8_SAVE: + + SAVE4x8 + + addic. I, I, -1 + bgt .LCGEMM_L4x8_BEGIN + +.LCGEMM_L4x8_END: + +.LCGEMM_L4x4_BEGIN: + + andi. T2, M, 7 + ble .LCGEMM_L4x1_END + + andi. T1, M, 4 + ble .LCGEMM_L4x4_END + mr BO, B + srawi. L, K, 3 + ble .LCGEMM_L4x4_SUB0 + cmpwi cr0, L, 1 + ble .LCGEMM_L4x4_SUB4 + +.LCGEMM_L4x4_LOOP_START: + + LOAD4x4_1 + KERNEL4x4_I1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + addic. L, L, -2 + ble .LCGEMM_L4x4_LOOP_END + + .align 5 + +.LCGEMM_L4x4_LOOP: + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + addic. 
L, L, -1 + bgt .LCGEMM_L4x4_LOOP + +.LCGEMM_L4x4_LOOP_END: + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_E2 + + b .LCGEMM_L4x4_SUB1 + +.LCGEMM_L4x4_SUB4: + + KERNEL4x4_SUBI1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + + b .LCGEMM_L4x4_SUB1 + +.LCGEMM_L4x4_SUB0: + + andi. L, K, 7 + + KERNEL4x4_SUBI1 + + addic. L, L, -1 + ble .LCGEMM_L4x4_SAVE + b .LCGEMM_L4x4_SUB2 + +.LCGEMM_L4x4_SUB1: + + andi. L, K, 7 + ble .LCGEMM_L4x4_SAVE + +.LCGEMM_L4x4_SUB2: + + KERNEL4x4_SUB1 + + addic. L, L, -1 + bgt .LCGEMM_L4x4_SUB2 + +.LCGEMM_L4x4_SAVE: + + SAVE4x4 + +.LCGEMM_L4x4_END: + +.LCGEMM_L4x2_BEGIN: + + + andi. T1, M, 2 + ble .LCGEMM_L4x2_END + mr BO, B + srawi. L, K, 3 + ble .LCGEMM_L4x2_SUB0 + cmpwi cr0, L, 1 + ble .LCGEMM_L4x2_SUB4 + +.LCGEMM_L4x2_LOOP_START: + + LOAD4x2_1 + KERNEL4x2_I1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + addic. L, L, -2 + ble .LCGEMM_L4x2_LOOP_END + + .align 5 + +.LCGEMM_L4x2_LOOP: + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + addic. L, L, -1 + bgt .LCGEMM_L4x2_LOOP + +.LCGEMM_L4x2_LOOP_END: + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_E2 + + b .LCGEMM_L4x2_SUB1 + +.LCGEMM_L4x2_SUB4: + + KERNEL4x2_SUBI1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + + b .LCGEMM_L4x2_SUB1 + +.LCGEMM_L4x2_SUB0: + + andi. L, K, 7 + + KERNEL4x2_SUBI1 + + addic. L, L, -1 + ble .LCGEMM_L4x2_SAVE + b .LCGEMM_L4x2_SUB2 + +.LCGEMM_L4x2_SUB1: + + andi. L, K, 7 + ble .LCGEMM_L4x2_SAVE + +.LCGEMM_L4x2_SUB2: + + KERNEL4x2_SUB1 + + addic. L, L, -1 + bgt .LCGEMM_L4x2_SUB2 + +.LCGEMM_L4x2_SAVE: + + SAVE4x2 + +.LCGEMM_L4x2_END: + +.LCGEMM_L4x1_BEGIN: + + + andi. 
T1, M, 1 + ble .LCGEMM_L4x1_END + mr BO, B + srawi. L, K, 3 + ble .LCGEMM_L4x1_SUB0 + cmpwi cr0, L, 1 + ble .LCGEMM_L4x1_SUB4 + +.LCGEMM_L4x1_LOOP_START: + + LOAD4x1_1 + KERNEL4x1_I1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + addic. L, L, -2 + ble .LCGEMM_L4x1_LOOP_END + + .align 5 + +.LCGEMM_L4x1_LOOP: + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + addic. L, L, -1 + bgt .LCGEMM_L4x1_LOOP + +.LCGEMM_L4x1_LOOP_END: + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_E2 + + b .LCGEMM_L4x1_SUB1 + +.LCGEMM_L4x1_SUB4: + + KERNEL4x1_SUBI1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + + b .LCGEMM_L4x1_SUB1 + +.LCGEMM_L4x1_SUB0: + + andi. L, K, 7 + + KERNEL4x1_SUBI1 + + addic. L, L, -1 + ble .LCGEMM_L4x1_SAVE + b .LCGEMM_L4x1_SUB2 + +.LCGEMM_L4x1_SUB1: + + andi. L, K, 7 + ble .LCGEMM_L4x1_SAVE + +.LCGEMM_L4x1_SUB2: + + KERNEL4x1_SUB1 + + addic. L, L, -1 + bgt .LCGEMM_L4x1_SUB2 + +.LCGEMM_L4x1_SAVE: + + SAVE4x1 + +.LCGEMM_L4x1_END: + + slwi T1, K, 5 + add B, B, T1 + + addic. J, J, -1 + bgt .LCGEMM_L4_BEGIN + + andi. T2, N, 3 + ble .L999_H2 + +.LCGEMM_L4_END: + + b .LCGEMM_L2_BEGIN + +.L999_H1: + + b .L999_H2 + +.LCGEMM_L2_BEGIN: + + andi. T1, N, 2 + ble .LCGEMM_L2_END + mr CO, C + mr AO, A + slwi T1, LDC , 1 + add C, C, T1 + srawi. I, M, 3 + ble .LCGEMM_L2x8_END + +.LCGEMM_L2x8_BEGIN: + + + mr BO, B + srawi. L, K, 3 + ble .LCGEMM_L2x8_SUB0 + cmpwi cr0, L, 1 + ble .LCGEMM_L2x8_SUB4 + +.LCGEMM_L2x8_LOOP_START: + + dcbt AO, PRE + LOAD2x8_1 + KERNEL2x8_I1 + dcbt AO, PRE + KERNEL2x8_2 + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + + addic. 
L, L, -2 + ble .LCGEMM_L2x8_LOOP_END + + .align 5 + +.LCGEMM_L2x8_LOOP: + + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + + addic. L, L, -1 + bgt .LCGEMM_L2x8_LOOP + +.LCGEMM_L2x8_LOOP_END: + + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + + KERNEL2x8_1 + dcbt AO, PRE + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_E2 + + b .LCGEMM_L2x8_SUB1 + +.LCGEMM_L2x8_SUB4: + + KERNEL2x8_SUBI1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + + b .LCGEMM_L2x8_SUB1 + +.LCGEMM_L2x8_SUB0: + + andi. L, K, 7 + + KERNEL2x8_SUBI1 + + addic. L, L, -1 + ble .LCGEMM_L2x8_SAVE + b .LCGEMM_L2x8_SUB2 + +.LCGEMM_L2x8_SUB1: + + andi. L, K, 7 + ble .LCGEMM_L2x8_SAVE + +.LCGEMM_L2x8_SUB2: + + KERNEL2x8_SUB1 + + addic. L, L, -1 + bgt .LCGEMM_L2x8_SUB2 + +.LCGEMM_L2x8_SAVE: + + SAVE2x8 + + addic. I, I, -1 + bgt .LCGEMM_L2x8_BEGIN + +.LCGEMM_L2x8_END: + +.LCGEMM_L2x4_BEGIN: + + andi. T2, M, 7 + ble .LCGEMM_L2x1_END + + andi. T1, M, 4 + ble .LCGEMM_L2x4_END + mr BO, B + srawi. L, K, 3 + ble .LCGEMM_L2x4_SUB0 + cmpwi cr0, L, 1 + ble .LCGEMM_L2x4_SUB4 + +.LCGEMM_L2x4_LOOP_START: + + LOAD2x4_1 + KERNEL2x4_I1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + addic. L, L, -2 + ble .LCGEMM_L2x4_LOOP_END + + .align 5 + +.LCGEMM_L2x4_LOOP: + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + addic. 
L, L, -1 + bgt .LCGEMM_L2x4_LOOP + +.LCGEMM_L2x4_LOOP_END: + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_E2 + + b .LCGEMM_L2x4_SUB1 + +.LCGEMM_L2x4_SUB4: + + KERNEL2x4_SUBI1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + + b .LCGEMM_L2x4_SUB1 + +.LCGEMM_L2x4_SUB0: + + andi. L, K, 7 + + KERNEL2x4_SUBI1 + + addic. L, L, -1 + ble .LCGEMM_L2x4_SAVE + b .LCGEMM_L2x4_SUB2 + +.LCGEMM_L2x4_SUB1: + + andi. L, K, 7 + ble .LCGEMM_L2x4_SAVE + +.LCGEMM_L2x4_SUB2: + + KERNEL2x4_SUB1 + + addic. L, L, -1 + bgt .LCGEMM_L2x4_SUB2 + +.LCGEMM_L2x4_SAVE: + + SAVE2x4 + +.LCGEMM_L2x4_END: + +.LCGEMM_L2x2_BEGIN: + + + andi. T1, M, 2 + ble .LCGEMM_L2x2_END + mr BO, B + srawi. L, K, 3 + ble .LCGEMM_L2x2_SUB0 + cmpwi cr0, L, 1 + ble .LCGEMM_L2x2_SUB4 + +.LCGEMM_L2x2_LOOP_START: + + LOAD2x2_1 + KERNEL2x2_I1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + addic. L, L, -2 + ble .LCGEMM_L2x2_LOOP_END + + .align 5 + +.LCGEMM_L2x2_LOOP: + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + addic. L, L, -1 + bgt .LCGEMM_L2x2_LOOP + +.LCGEMM_L2x2_LOOP_END: + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_E2 + + b .LCGEMM_L2x2_SUB1 + +.LCGEMM_L2x2_SUB4: + + KERNEL2x2_SUBI1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + + b .LCGEMM_L2x2_SUB1 + +.LCGEMM_L2x2_SUB0: + + andi. L, K, 7 + + KERNEL2x2_SUBI1 + + addic. L, L, -1 + ble .LCGEMM_L2x2_SAVE + b .LCGEMM_L2x2_SUB2 + +.LCGEMM_L2x2_SUB1: + + andi. L, K, 7 + ble .LCGEMM_L2x2_SAVE + +.LCGEMM_L2x2_SUB2: + + KERNEL2x2_SUB1 + + addic. L, L, -1 + bgt .LCGEMM_L2x2_SUB2 + +.LCGEMM_L2x2_SAVE: + + SAVE2x2 + +.LCGEMM_L2x2_END: + +.LCGEMM_L2x1_BEGIN: + + + andi. 
T1, M, 1 + ble .LCGEMM_L2x1_END + mr BO, B + srawi. L, K, 3 + ble .LCGEMM_L2x1_SUB0 + cmpwi cr0, L, 1 + ble .LCGEMM_L2x1_SUB4 + +.LCGEMM_L2x1_LOOP_START: + + LOAD2x1_1 + KERNEL2x1_I1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + addic. L, L, -2 + ble .LCGEMM_L2x1_LOOP_END + + .align 5 + +.LCGEMM_L2x1_LOOP: + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + addic. L, L, -1 + bgt .LCGEMM_L2x1_LOOP + +.LCGEMM_L2x1_LOOP_END: + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_E2 + + b .LCGEMM_L2x1_SUB1 + +.LCGEMM_L2x1_SUB4: + + KERNEL2x1_SUBI1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + + b .LCGEMM_L2x1_SUB1 + +.LCGEMM_L2x1_SUB0: + + andi. L, K, 7 + + KERNEL2x1_SUBI1 + + addic. L, L, -1 + ble .LCGEMM_L2x1_SAVE + b .LCGEMM_L2x1_SUB2 + +.LCGEMM_L2x1_SUB1: + + andi. L, K, 7 + ble .LCGEMM_L2x1_SAVE + +.LCGEMM_L2x1_SUB2: + + KERNEL2x1_SUB1 + + addic. L, L, -1 + bgt .LCGEMM_L2x1_SUB2 + +.LCGEMM_L2x1_SAVE: + + SAVE2x1 + +.LCGEMM_L2x1_END: + + slwi T1, K, 4 + add B, B, T1 + +.LCGEMM_L2_END: + + b .LCGEMM_L1_BEGIN + +.L999_H2: + + b .L999 + +.LCGEMM_L1_BEGIN: + + andi. T1, N, 1 + ble .LCGEMM_L1_END + mr CO, C + mr AO, A + srawi. I, M, 3 + ble .LCGEMM_L1x8_END + +.LCGEMM_L1x8_BEGIN: + + + mr BO, B + srawi. L, K, 3 + ble .LCGEMM_L1x8_SUB0 + cmpwi cr0, L, 1 + ble .LCGEMM_L1x8_SUB4 + +.LCGEMM_L1x8_LOOP_START: + + dcbt AO, PRE + LOAD1x8_1 + KERNEL1x8_I1 + dcbt AO, PRE + KERNEL1x8_2 + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + + addic. 
L, L, -2 + ble .LCGEMM_L1x8_LOOP_END + + .align 5 + +.LCGEMM_L1x8_LOOP: + + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + + addic. L, L, -1 + bgt .LCGEMM_L1x8_LOOP + +.LCGEMM_L1x8_LOOP_END: + + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + + KERNEL1x8_1 + dcbt AO, PRE + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_E2 + + b .LCGEMM_L1x8_SUB1 + +.LCGEMM_L1x8_SUB4: + + KERNEL1x8_SUBI1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + + b .LCGEMM_L1x8_SUB1 + +.LCGEMM_L1x8_SUB0: + + andi. L, K, 7 + + KERNEL1x8_SUBI1 + + addic. L, L, -1 + ble .LCGEMM_L1x8_SAVE + b .LCGEMM_L1x8_SUB2 + +.LCGEMM_L1x8_SUB1: + + andi. L, K, 7 + ble .LCGEMM_L1x8_SAVE + +.LCGEMM_L1x8_SUB2: + + KERNEL1x8_SUB1 + + addic. L, L, -1 + bgt .LCGEMM_L1x8_SUB2 + +.LCGEMM_L1x8_SAVE: + + SAVE1x8 + + addic. I, I, -1 + bgt .LCGEMM_L1x8_BEGIN + +.LCGEMM_L1x8_END: + +.LCGEMM_L1x4_BEGIN: + + andi. T2, M, 7 + ble .LCGEMM_L1x1_END + + andi. T1, M, 4 + ble .LCGEMM_L1x4_END + mr BO, B + srawi. L, K, 3 + ble .LCGEMM_L1x4_SUB0 + cmpwi cr0, L, 1 + ble .LCGEMM_L1x4_SUB4 + +.LCGEMM_L1x4_LOOP_START: + + LOAD1x4_1 + KERNEL1x4_I1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + addic. L, L, -2 + ble .LCGEMM_L1x4_LOOP_END + + .align 5 + +.LCGEMM_L1x4_LOOP: + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + addic. 
L, L, -1 + bgt .LCGEMM_L1x4_LOOP + +.LCGEMM_L1x4_LOOP_END: + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_E2 + + b .LCGEMM_L1x4_SUB1 + +.LCGEMM_L1x4_SUB4: + + KERNEL1x4_SUBI1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + + b .LCGEMM_L1x4_SUB1 + +.LCGEMM_L1x4_SUB0: + + andi. L, K, 7 + + KERNEL1x4_SUBI1 + + addic. L, L, -1 + ble .LCGEMM_L1x4_SAVE + b .LCGEMM_L1x4_SUB2 + +.LCGEMM_L1x4_SUB1: + + andi. L, K, 7 + ble .LCGEMM_L1x4_SAVE + +.LCGEMM_L1x4_SUB2: + + KERNEL1x4_SUB1 + + addic. L, L, -1 + bgt .LCGEMM_L1x4_SUB2 + +.LCGEMM_L1x4_SAVE: + + SAVE1x4 + +.LCGEMM_L1x4_END: + +.LCGEMM_L1x2_BEGIN: + + + andi. T1, M, 2 + ble .LCGEMM_L1x2_END + mr BO, B + srawi. L, K, 3 + ble .LCGEMM_L1x2_SUB0 + cmpwi cr0, L, 1 + ble .LCGEMM_L1x2_SUB4 + +.LCGEMM_L1x2_LOOP_START: + + LOAD1x2_1 + KERNEL1x2_I1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + addic. L, L, -2 + ble .LCGEMM_L1x2_LOOP_END + + .align 5 + +.LCGEMM_L1x2_LOOP: + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + addic. L, L, -1 + bgt .LCGEMM_L1x2_LOOP + +.LCGEMM_L1x2_LOOP_END: + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_E2 + + b .LCGEMM_L1x2_SUB1 + +.LCGEMM_L1x2_SUB4: + + KERNEL1x2_SUBI1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + + b .LCGEMM_L1x2_SUB1 + +.LCGEMM_L1x2_SUB0: + + andi. L, K, 7 + + KERNEL1x2_SUBI1 + + addic. L, L, -1 + ble .LCGEMM_L1x2_SAVE + b .LCGEMM_L1x2_SUB2 + +.LCGEMM_L1x2_SUB1: + + andi. L, K, 7 + ble .LCGEMM_L1x2_SAVE + +.LCGEMM_L1x2_SUB2: + + KERNEL1x2_SUB1 + + addic. L, L, -1 + bgt .LCGEMM_L1x2_SUB2 + +.LCGEMM_L1x2_SAVE: + + SAVE1x2 + +.LCGEMM_L1x2_END: + +.LCGEMM_L1x1_BEGIN: + + + andi. 
T1, M, 1 + ble .LCGEMM_L1x1_END + mr BO, B + srawi. L, K, 3 + ble .LCGEMM_L1x1_SUB0 + cmpwi cr0, L, 1 + ble .LCGEMM_L1x1_SUB4 + +.LCGEMM_L1x1_LOOP_START: + + LOAD1x1_1 + KERNEL1x1_I1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + addic. L, L, -2 + ble .LCGEMM_L1x1_LOOP_END + + .align 5 + +.LCGEMM_L1x1_LOOP: + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + addic. L, L, -1 + bgt .LCGEMM_L1x1_LOOP + +.LCGEMM_L1x1_LOOP_END: + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_E2 + + b .LCGEMM_L1x1_SUB1 + +.LCGEMM_L1x1_SUB4: + + KERNEL1x1_SUBI1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + + b .LCGEMM_L1x1_SUB1 + +.LCGEMM_L1x1_SUB0: + + andi. L, K, 7 + + KERNEL1x1_SUBI1 + + addic. L, L, -1 + ble .LCGEMM_L1x1_SAVE + b .LCGEMM_L1x1_SUB2 + +.LCGEMM_L1x1_SUB1: + + andi. L, K, 7 + ble .LCGEMM_L1x1_SAVE + +.LCGEMM_L1x1_SUB2: + + KERNEL1x1_SUB1 + + addic. L, L, -1 + bgt .LCGEMM_L1x1_SUB2 + +.LCGEMM_L1x1_SAVE: + + SAVE1x1 + +.LCGEMM_L1x1_END: + +.LCGEMM_L1_END: diff --git a/kernel/power/cgemm_macros_8x4_power8.S b/kernel/power/cgemm_macros_8x4_power8.S new file mode 100644 index 000000000..2085d3764 --- /dev/null +++ b/kernel/power/cgemm_macros_8x4_power8.S @@ -0,0 +1,6713 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2016/03/18 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + + #define XSFADD_R1 xsaddsp + #define XSFADD_R2 xssubsp + #define XSFADD_I1 xsaddsp + #define XSFADD_I2 xsaddsp + +#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) + + #define XSFADD_R1 xsaddsp + #define XSFADD_R2 xsaddsp + #define XSFADD_I1 xssubsp + #define XSFADD_I2 xsaddsp + +#elif defined(NC) || defined(TC) || defined(NR) || defined(TR) + + #define XSFADD_R1 xsaddsp + #define XSFADD_R2 xsaddsp + #define XSFADD_I1 xsaddsp + #define XSFADD_I2 xssubsp + +#else // CC || CR || RC || RR + + #define XSFADD_R1 xsaddsp + #define XSFADD_R2 xssubsp + #define XSFADD_I1 xssubsp + #define XSFADD_I2 xssubsp + +#endif + +/********************************************************************************************** +* Macros for N=4 and M=8 +**********************************************************************************************/ + +.macro LOAD4x8_1 + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + lxvw4x vs2, o32, AO // load a4, a5 + + lxvw4x vs3, o48, AO // load a6, a7 + + + addi AO, AO, 64 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + lxvw4x vs25, o16, BO // load b2, b3 + + xxspltw vs12, vs25, 0 + xxspltw vs13, vs25, 1 + xxspltw vs14, vs25, 2 + xxspltw vs15, vs25, 3 + + + addi BO, BO, 32 + +.endm + +.macro KERNEL4x8_I1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + lxvw4x vs5, o16, AO // load a2, a3 + + lxvw4x vs6, o32, AO // load a4, a5 + + lxvw4x vs7, o48, AO // load a6, a7 + + + addi AO, AO, 64 + + 
lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs16, vs24, 0 + xxspltw vs17, vs24, 1 + xxspltw vs18, vs24, 2 + xxspltw vs19, vs24, 3 + + lxvw4x vs25, o16, BO // load b2, b3 + + xxspltw vs20, vs25, 0 + xxspltw vs21, vs25, 1 + xxspltw vs22, vs25, 2 + xxspltw vs23, vs25, 3 + + + addi BO, BO, 32 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmulsp vs40, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs41, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs42, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs43, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs44, vs2, vs10 // a2_r*b1_r, a2_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs45, vs2, vs11 // a2_r*b1_i, a2_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs46, vs3, vs10 // a3_r*b1_r, a3_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs47, vs3, vs11 // a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmulsp vs48, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmulsp vs49, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmulsp vs50, vs1, vs12 // a1_r*b2_r, a1_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmulsp vs51, vs1, vs13 // a1_r*b2_i, a1_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmulsp vs52, vs2, vs12 // a2_r*b2_r, a2_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmulsp vs53, vs2, vs13 // a2_r*b2_i, a2_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmulsp vs54, vs3, vs12 // a3_r*b2_r, a3_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmulsp vs55, vs3, vs13 // a3_r*b2_i, a3_i*b2_i, 
a1_r*b2_i, a1_i*b2_i
+
+	xvmulsp	vs56, vs0, vs14		// a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r
+	xvmulsp	vs57, vs0, vs15		// a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i
+	xvmulsp	vs58, vs1, vs14		// a1_r*b3_r, a1_i*b3_r, a1_r*b3_r, a1_i*b3_r
+	xvmulsp	vs59, vs1, vs15		// a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i
+	xvmulsp	vs60, vs2, vs14		// a2_r*b3_r, a2_i*b3_r, a1_r*b3_r, a1_i*b3_r
+	xvmulsp	vs61, vs2, vs15		// a2_r*b3_i, a2_i*b3_i, a1_r*b3_i, a1_i*b3_i
+	xvmulsp	vs62, vs3, vs14		// a3_r*b3_r, a3_i*b3_r, a1_r*b3_r, a1_i*b3_r
+	xvmulsp	vs63, vs3, vs15		// a3_r*b3_i, a3_i*b3_i, a1_r*b3_i, a1_i*b3_i
+
+
+.endm
+
+// KERNEL4x8_1: first ("even") step of the software-pipelined k-loop body.
+// Accumulates the current operand set (A in vs0-vs3, splatted B in vs8-vs15)
+// into the 32 accumulators vs32-vs63 with xvmaddasp, while prefetching the
+// NEXT A panel into vs4-vs7 and the next B panel into vs24/vs25 (splatted to
+// vs16-vs23 for KERNEL4x8_2).  Advances AO by 64 and BO by 32 bytes.
+// NOTE(review): the _r/_i lane comments throughout these macros appear to be
+// copied from a complex-arithmetic kernel; each VSX lane here is one real
+// float of the sgemm panels — verify comments against the copy routines.
+.macro KERNEL4x8_1
+
+	xvmaddasp	vs32, vs0, vs8		// a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+	xvmaddasp	vs33, vs0, vs9		// a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+	lxvw4x	vs24, o0, BO		// load b0, b1
+	lxvw4x	vs4, o0, AO		// load a0, a1
+
+	xvmaddasp	vs34, vs1, vs8		// a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r
+	xvmaddasp	vs35, vs1, vs9		// a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+	lxvw4x	vs25, o16, BO		// load b2, b3
+	lxvw4x	vs5, o16, AO		// load a2, a3
+
+	xvmaddasp	vs36, vs2, vs8		// a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r
+	xvmaddasp	vs37, vs2, vs9		// a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+	lxvw4x	vs6, o32, AO		// load a4, a5
+	lxvw4x	vs7, o48, AO		// load a6, a7
+
+	xvmaddasp	vs38, vs3, vs8		// a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r
+	xvmaddasp	vs39, vs3, vs9		// a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+	xvmaddasp	vs40, vs0, vs10		// a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+	xvmaddasp	vs41, vs0, vs11		// a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+	xvmaddasp	vs42, vs1, vs10		// a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r
+	xvmaddasp	vs43, vs1, vs11		// a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i
+	xvmaddasp	vs44, vs2, vs10		// a2_r*b1_r, a2_i*b1_r, a1_r*b1_r, a1_i*b1_r
+	xvmaddasp	vs45, vs2, vs11		// a2_r*b1_i, a2_i*b1_i, a1_r*b1_i, a1_i*b1_i
+	xvmaddasp	vs46, vs3, vs10		// a3_r*b1_r, a3_i*b1_r, a1_r*b1_r, a1_i*b1_r
+	xvmaddasp	vs47, vs3, vs11		// a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i
+
+	xvmaddasp	vs48, vs0, vs12		// a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r
+	xvmaddasp	vs49, vs0, vs13		// a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i
+	xvmaddasp	vs50, vs1, vs12		// a1_r*b2_r, a1_i*b2_r, a1_r*b2_r, a1_i*b2_r
+	xvmaddasp	vs51, vs1, vs13		// a1_r*b2_i, a1_i*b2_i, a1_r*b2_i, a1_i*b2_i
+	xvmaddasp	vs52, vs2, vs12		// a2_r*b2_r, a2_i*b2_r, a1_r*b2_r, a1_i*b2_r
+	xvmaddasp	vs53, vs2, vs13		// a2_r*b2_i, a2_i*b2_i, a1_r*b2_i, a1_i*b2_i
+	xvmaddasp	vs54, vs3, vs12		// a3_r*b2_r, a3_i*b2_r, a1_r*b2_r, a1_i*b2_r
+	xvmaddasp	vs55, vs3, vs13		// a3_r*b2_i, a3_i*b2_i, a1_r*b2_i, a1_i*b2_i
+
+	xxspltw	vs16, vs24, 0
+	xxspltw	vs17, vs24, 1
+	xxspltw	vs18, vs24, 2
+	xxspltw	vs19, vs24, 3
+
+	xvmaddasp	vs56, vs0, vs14		// a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r
+	xvmaddasp	vs57, vs0, vs15		// a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i
+	xvmaddasp	vs58, vs1, vs14		// a1_r*b3_r, a1_i*b3_r, a1_r*b3_r, a1_i*b3_r
+	xvmaddasp	vs59, vs1, vs15		// a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i
+	addi	BO, BO, 32
+	xvmaddasp	vs60, vs2, vs14		// a2_r*b3_r, a2_i*b3_r, a1_r*b3_r, a1_i*b3_r
+	xvmaddasp	vs61, vs2, vs15		// a2_r*b3_i, a2_i*b3_i, a1_r*b3_i, a1_i*b3_i
+	addi	AO, AO, 64
+	xvmaddasp	vs62, vs3, vs14		// a3_r*b3_r, a3_i*b3_r, a1_r*b3_r, a1_i*b3_r
+	xvmaddasp	vs63, vs3, vs15		// a3_r*b3_i, a3_i*b3_i, a1_r*b3_i, a1_i*b3_i
+
+	xxspltw	vs20, vs25, 0
+	xxspltw	vs21, vs25, 1
+	xxspltw	vs22, vs25, 2
+	xxspltw	vs23, vs25, 3
+
+.endm
+
+// KERNEL4x8_2: second ("odd") step of the pipelined k-loop body — the mirror
+// of KERNEL4x8_1.  Consumes the operands KERNEL4x8_1 prefetched (A in
+// vs4-vs7, splatted B in vs16-vs23) while reloading vs0-vs3 and vs24/vs25
+// (splatted to vs8-vs15) for the next KERNEL4x8_1.  Advances AO by 64,
+// BO by 32 bytes.
+.macro KERNEL4x8_2
+
+	xvmaddasp	vs32, vs4, vs16		// a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r
+	xvmaddasp	vs33, vs4, vs17		// a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+	lxvw4x	vs24, o0, BO		// load b0, b1
+	lxvw4x	vs0, o0, AO		// load a0, a1
+
+	xvmaddasp	vs34, vs5, vs16		// a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r
+	xvmaddasp	vs35, vs5, vs17		// a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+	lxvw4x	vs25, o16, BO		// load b2, b3
+	lxvw4x	vs1, o16, AO		// load a2, a3
+
+	xvmaddasp	vs36, vs6, vs16		// a6_r*b0_r, a6_i*b0_r, a1_r*b0_r, a1_i*b0_r
+	xvmaddasp	vs37, vs6, vs17		// a6_r*b0_i, a6_i*b0_i, a1_r*b0_i, a1_i*b0_i
+	lxvw4x	vs2, o32, AO		// load a4, a5
+	lxvw4x	vs3, o48, AO		// load a6, a7
+	xvmaddasp	vs38, vs7, vs16		// a7_r*b0_r, a7_i*b0_r, a1_r*b0_r, a1_i*b0_r
+	xvmaddasp	vs39, vs7, vs17		// a7_r*b0_i, a7_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+	xvmaddasp	vs40, vs4, vs18		// a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r
+	xvmaddasp	vs41, vs4, vs19		// a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i
+	xvmaddasp	vs42, vs5, vs18		// a5_r*b1_r, a5_i*b1_r, a1_r*b1_r, a1_i*b1_r
+	xvmaddasp	vs43, vs5, vs19		// a5_r*b1_i, a5_i*b1_i, a1_r*b1_i, a1_i*b1_i
+	xvmaddasp	vs44, vs6, vs18		// a6_r*b1_r, a6_i*b1_r, a1_r*b1_r, a1_i*b1_r
+	xvmaddasp	vs45, vs6, vs19		// a6_r*b1_i, a6_i*b1_i, a1_r*b1_i, a1_i*b1_i
+	xvmaddasp	vs46, vs7, vs18		// a7_r*b1_r, a7_i*b1_r, a1_r*b1_r, a1_i*b1_r
+	xvmaddasp	vs47, vs7, vs19		// a7_r*b1_i, a7_i*b1_i, a1_r*b1_i, a1_i*b1_i
+
+	xvmaddasp	vs48, vs4, vs20		// a4_r*b2_r, a4_i*b2_r, a1_r*b2_r, a1_i*b2_r
+	xvmaddasp	vs49, vs4, vs21		// a4_r*b2_i, a4_i*b2_i, a1_r*b2_i, a1_i*b2_i
+	xvmaddasp	vs50, vs5, vs20		// a5_r*b2_r, a5_i*b2_r, a1_r*b2_r, a1_i*b2_r
+	xvmaddasp	vs51, vs5, vs21		// a5_r*b2_i, a5_i*b2_i, a1_r*b2_i, a1_i*b2_i
+	xvmaddasp	vs52, vs6, vs20		// a6_r*b2_r, a6_i*b2_r, a1_r*b2_r, a1_i*b2_r
+	xvmaddasp	vs53, vs6, vs21		// a6_r*b2_i, a6_i*b2_i, a1_r*b2_i, a1_i*b2_i
+	xvmaddasp	vs54, vs7, vs20		// a7_r*b2_r, a7_i*b2_r, a1_r*b2_r, a1_i*b2_r
+	xvmaddasp	vs55, vs7, vs21		// a7_r*b2_i, a7_i*b2_i, a1_r*b2_i, a1_i*b2_i
+
+	xxspltw	vs8, vs24, 0
+	xxspltw	vs9, vs24, 1
+	xxspltw	vs10, vs24, 2
+	xxspltw	vs11, vs24, 3
+
+	xvmaddasp	vs56, vs4, vs22		// a4_r*b3_r, a4_i*b3_r, a1_r*b3_r, a1_i*b3_r
+	xvmaddasp	vs57, vs4, vs23		// a4_r*b3_i, a4_i*b3_i, a1_r*b3_i, a1_i*b3_i
+	xvmaddasp	vs58, vs5, vs22		// a5_r*b3_r, a5_i*b3_r, a1_r*b3_r, a1_i*b3_r
+	xvmaddasp	vs59, vs5, vs23		// a5_r*b3_i, a5_i*b3_i, a1_r*b3_i, a1_i*b3_i
+	addi	AO, AO, 64
+	xvmaddasp	vs60, vs6, vs22		// a6_r*b3_r, a6_i*b3_r, a1_r*b3_r, a1_i*b3_r
+	xvmaddasp	vs61, vs6, vs23		// a6_r*b3_i, a6_i*b3_i, a1_r*b3_i, a1_i*b3_i
+	addi	BO, BO, 32
+	xvmaddasp	vs62, vs7, vs22		// a7_r*b3_r, a7_i*b3_r, a1_r*b3_r, a1_i*b3_r
+	xvmaddasp	vs63, vs7, vs23		// a7_r*b3_i, a7_i*b3_i, a1_r*b3_i, a1_i*b3_i
+
+	xxspltw	vs12, vs25, 0
+	xxspltw	vs13, vs25, 1
+	xxspltw	vs14, vs25, 2
+	xxspltw	vs15, vs25, 3
+
+.endm
+
+// KERNEL4x8_E2: pipeline drain (epilogue) step.  Same accumulation as
+// KERNEL4x8_2 — consumes the already-prefetched vs4-vs7 / vs16-vs23 — but
+// performs NO loads and NO AO/BO pointer updates, ending the unrolled loop.
+.macro KERNEL4x8_E2
+
+
+	xvmaddasp	vs32, vs4, vs16		// a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r
+	xvmaddasp	vs33, vs4, vs17		// a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i
+	xvmaddasp	vs34, vs5, vs16		// a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r
+	xvmaddasp	vs35, vs5, vs17		// a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i
+	xvmaddasp	vs36, vs6, vs16		// a6_r*b0_r, a6_i*b0_r, a1_r*b0_r, a1_i*b0_r
+	xvmaddasp	vs37, vs6, vs17		// a6_r*b0_i, a6_i*b0_i, a1_r*b0_i, a1_i*b0_i
+	xvmaddasp	vs38, vs7, vs16		// a7_r*b0_r, a7_i*b0_r, a1_r*b0_r, a1_i*b0_r
+	xvmaddasp	vs39, vs7, vs17		// a7_r*b0_i, a7_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+	xvmaddasp	vs40, vs4, vs18		// a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r
+	xvmaddasp	vs41, vs4, vs19		// a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i
+	xvmaddasp	vs42, vs5, vs18		// a5_r*b1_r, a5_i*b1_r, a1_r*b1_r, a1_i*b1_r
+	xvmaddasp	vs43, vs5, vs19		// a5_r*b1_i, a5_i*b1_i, a1_r*b1_i, a1_i*b1_i
+	xvmaddasp	vs44, vs6, vs18		// a6_r*b1_r, a6_i*b1_r, a1_r*b1_r, a1_i*b1_r
+	xvmaddasp	vs45, vs6, vs19		// a6_r*b1_i, a6_i*b1_i, a1_r*b1_i, a1_i*b1_i
+	xvmaddasp	vs46, vs7, vs18		// a7_r*b1_r, a7_i*b1_r, a1_r*b1_r, a1_i*b1_r
+	xvmaddasp	vs47, vs7, vs19		// a7_r*b1_i, a7_i*b1_i, a1_r*b1_i, a1_i*b1_i
+
+	xvmaddasp	vs48, vs4, vs20		// a4_r*b2_r, a4_i*b2_r, a1_r*b2_r, a1_i*b2_r
+	xvmaddasp	vs49, vs4, vs21		// a4_r*b2_i, a4_i*b2_i, a1_r*b2_i, a1_i*b2_i
+	xvmaddasp	vs50, vs5, vs20		// a5_r*b2_r, a5_i*b2_r, a1_r*b2_r, a1_i*b2_r
+	xvmaddasp	vs51, vs5, vs21		// a5_r*b2_i, a5_i*b2_i, a1_r*b2_i, a1_i*b2_i
+	xvmaddasp	vs52, vs6, vs20		// a6_r*b2_r, a6_i*b2_r, a1_r*b2_r, a1_i*b2_r
+	xvmaddasp	vs53, vs6, vs21		// a6_r*b2_i, a6_i*b2_i, a1_r*b2_i, a1_i*b2_i
+	xvmaddasp	vs54, vs7, vs20		// a7_r*b2_r, a7_i*b2_r, a1_r*b2_r, a1_i*b2_r
+	xvmaddasp	vs55, vs7, vs21		// a7_r*b2_i, a7_i*b2_i, a1_r*b2_i, a1_i*b2_i
+
+	xvmaddasp	vs56, vs4, vs22		// a4_r*b3_r, a4_i*b3_r, a1_r*b3_r, a1_i*b3_r
+	xvmaddasp	vs57, vs4, vs23		// a4_r*b3_i, a4_i*b3_i, a1_r*b3_i, a1_i*b3_i
+	xvmaddasp	vs58, vs5, vs22		// a5_r*b3_r, a5_i*b3_r, a1_r*b3_r, a1_i*b3_r
+	xvmaddasp	vs59, vs5, vs23		// a5_r*b3_i, a5_i*b3_i, a1_r*b3_i, a1_i*b3_i
+	xvmaddasp	vs60, vs6, vs22		// a6_r*b3_r, a6_i*b3_r, a1_r*b3_r, a1_i*b3_r
+	xvmaddasp	vs61, vs6, vs23		// a6_r*b3_i, a6_i*b3_i, a1_r*b3_i, a1_i*b3_i
+	xvmaddasp	vs62, vs7, vs22		// a7_r*b3_r, a7_i*b3_r, a1_r*b3_r, a1_i*b3_r
+	xvmaddasp	vs63, vs7, vs23		// a7_r*b3_i, a7_i*b3_i, a1_r*b3_i, a1_i*b3_i
+
+
+.endm
+
+// KERNEL4x8_SUBI1: stand-alone first k-iteration (no pipelining).  Loads one
+// A panel (vs0-vs3) and one B panel (vs24/vs25, splatted to vs8-vs15), then
+// INITIALIZES the accumulators vs32-vs63 with xvmulsp (overwrite, not
+// accumulate) — used when this is the first update of the tile.
+// Advances AO by 64, BO by 32 bytes.
+.macro KERNEL4x8_SUBI1
+
+
+	lxvw4x	vs0, o0, AO		// load a0, a1
+
+	lxvw4x	vs1, o16, AO		// load a2, a3
+
+	lxvw4x	vs2, o32, AO		// load a4, a5
+
+	lxvw4x	vs3, o48, AO		// load a6, a7
+
+
+	addi	AO, AO, 64
+
+	lxvw4x	vs24, o0, BO		// load b0, b1
+
+	xxspltw	vs8, vs24, 0
+	xxspltw	vs9, vs24, 1
+	xxspltw	vs10, vs24, 2
+	xxspltw	vs11, vs24, 3
+
+	lxvw4x	vs25, o16, BO		// load b2, b3
+
+	xxspltw	vs12, vs25, 0
+	xxspltw	vs13, vs25, 1
+	xxspltw	vs14, vs25, 2
+	xxspltw	vs15, vs25, 3
+
+
+	addi	BO, BO, 32
+
+
+	xvmulsp	vs32, vs0, vs8		// a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+	xvmulsp	vs33, vs0, vs9		// a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+	xvmulsp	vs34, vs1, vs8		// a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r
+	xvmulsp	vs35, vs1, vs9		// a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i
+	xvmulsp	vs36, vs2, vs8		// a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r
+	xvmulsp	vs37, vs2, vs9		// a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i
+	xvmulsp	vs38, vs3, vs8		// a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r
+	xvmulsp	vs39, vs3, vs9		// a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+	xvmulsp	vs40, vs0, vs10		// a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+	xvmulsp	vs41, vs0, vs11		// a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+	xvmulsp	vs42, vs1, vs10		// a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r
+	xvmulsp	vs43, vs1, vs11		// a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i
+	xvmulsp	vs44, vs2, vs10		// a2_r*b1_r, a2_i*b1_r, a1_r*b1_r, a1_i*b1_r
+	xvmulsp	vs45, vs2, vs11		// a2_r*b1_i, a2_i*b1_i, a1_r*b1_i, a1_i*b1_i
+	xvmulsp	vs46, vs3, vs10		// a3_r*b1_r, a3_i*b1_r, a1_r*b1_r, a1_i*b1_r
+	xvmulsp	vs47, vs3, vs11		// a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i
+
+	xvmulsp	vs48, vs0, vs12		// a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r
+	xvmulsp	vs49, vs0, vs13		// a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i
+	xvmulsp	vs50, vs1, vs12		// a1_r*b2_r, a1_i*b2_r, a1_r*b2_r, a1_i*b2_r
+	xvmulsp	vs51, vs1, vs13		// a1_r*b2_i, a1_i*b2_i, a1_r*b2_i, a1_i*b2_i
+	xvmulsp	vs52, vs2, vs12		// a2_r*b2_r, a2_i*b2_r, a1_r*b2_r, a1_i*b2_r
+	xvmulsp	vs53, vs2, vs13		// a2_r*b2_i, a2_i*b2_i, a1_r*b2_i, a1_i*b2_i
+	xvmulsp	vs54, vs3, vs12		// a3_r*b2_r, a3_i*b2_r, a1_r*b2_r, a1_i*b2_r
+	xvmulsp	vs55, vs3, vs13		// a3_r*b2_i, a3_i*b2_i, a1_r*b2_i, a1_i*b2_i
+
+	xvmulsp	vs56, vs0, vs14		// a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r
+	xvmulsp	vs57, vs0, vs15		// a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i
+	xvmulsp	vs58, vs1, vs14		// a1_r*b3_r, a1_i*b3_r, a1_r*b3_r, a1_i*b3_r
+	xvmulsp	vs59, vs1, vs15		// a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i
+	xvmulsp	vs60, vs2, vs14		// a2_r*b3_r, a2_i*b3_r, a1_r*b3_r, a1_i*b3_r
+	xvmulsp	vs61, vs2, vs15		// a2_r*b3_i, a2_i*b3_i, a1_r*b3_i, a1_i*b3_i
+	xvmulsp	vs62, vs3, vs14		// a3_r*b3_r, a3_i*b3_r, a1_r*b3_r, a1_i*b3_r
+	xvmulsp	vs63, vs3, vs15		// a3_r*b3_i, a3_i*b3_i, a1_r*b3_i, a1_i*b3_i
+
+
+.endm
+
+// KERNEL4x8_SUB1: stand-alone k-iteration for the loop remainder.  Identical
+// load/splat pattern to KERNEL4x8_SUBI1 but ACCUMULATES into vs32-vs63 with
+// xvmaddasp instead of overwriting.  Advances AO by 64, BO by 32 bytes.
+.macro KERNEL4x8_SUB1
+
+
+	lxvw4x	vs0, o0, AO		// load a0, a1
+
+	lxvw4x	vs1, o16, AO		// load a2, a3
+
+	lxvw4x	vs2, o32, AO		// load a4, a5
+
+	lxvw4x	vs3, o48, AO		// load a6, a7
+
+
+	addi	AO, AO, 64
+
+	lxvw4x	vs24, o0, BO		// load b0, b1
+
+	xxspltw	vs8, vs24, 0
+	xxspltw	vs9, vs24, 1
+	xxspltw	vs10, vs24, 2
+	xxspltw	vs11, vs24, 3
+
+	lxvw4x	vs25, o16, BO		// load b2, b3
+
+	xxspltw	vs12, vs25, 0
+	xxspltw	vs13, vs25, 1
+	xxspltw	vs14, vs25, 2
+	xxspltw	vs15, vs25, 3
+
+
+	addi	BO, BO, 32
+
+
+	xvmaddasp	vs32, vs0, vs8		// a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r
+	xvmaddasp	vs33, vs0, vs9		// a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
+	xvmaddasp	vs34, vs1, vs8		// a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r
+	xvmaddasp	vs35, vs1, vs9		// a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i
+	xvmaddasp	vs36, vs2, vs8		// a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r
+	xvmaddasp	vs37, vs2, vs9		// a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i
+	xvmaddasp	vs38, vs3, vs8		// a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r
+	xvmaddasp	vs39, vs3, vs9		// a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i
+
+	xvmaddasp	vs40, vs0, vs10		// a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r
+	xvmaddasp	vs41, vs0, vs11		// a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
+	xvmaddasp	vs42, vs1, vs10		// a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r
+	xvmaddasp	vs43, vs1, vs11		// a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i
+	xvmaddasp	vs44, vs2, vs10		// a2_r*b1_r, a2_i*b1_r, a1_r*b1_r, a1_i*b1_r
+	xvmaddasp	vs45, vs2, vs11		// a2_r*b1_i, a2_i*b1_i, a1_r*b1_i, a1_i*b1_i
+	xvmaddasp	vs46, vs3, vs10		// a3_r*b1_r, a3_i*b1_r, a1_r*b1_r, a1_i*b1_r
+	xvmaddasp	vs47, vs3, vs11		// a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i
+
+	xvmaddasp	vs48, vs0, vs12		// a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r
+	xvmaddasp	vs49, vs0, vs13		// a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i
+	xvmaddasp	vs50, vs1, vs12		// a1_r*b2_r, a1_i*b2_r, a1_r*b2_r, a1_i*b2_r
+	xvmaddasp	vs51, vs1, vs13		// a1_r*b2_i, a1_i*b2_i, a1_r*b2_i, a1_i*b2_i
+	xvmaddasp	vs52, vs2, vs12		// a2_r*b2_r, a2_i*b2_r, a1_r*b2_r, a1_i*b2_r
+	xvmaddasp	vs53, vs2, vs13		// a2_r*b2_i, a2_i*b2_i, a1_r*b2_i, a1_i*b2_i
+	xvmaddasp	vs54, vs3, vs12		// a3_r*b2_r, a3_i*b2_r, a1_r*b2_r, a1_i*b2_r
+	xvmaddasp	vs55, vs3, vs13		// a3_r*b2_i, a3_i*b2_i, a1_r*b2_i, a1_i*b2_i
+
+	xvmaddasp	vs56, vs0, vs14		// a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r
+	xvmaddasp	vs57, vs0, vs15		// a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i
+	xvmaddasp	vs58, vs1, vs14		// a1_r*b3_r, a1_i*b3_r, a1_r*b3_r, a1_i*b3_r
+	xvmaddasp	vs59, vs1, vs15		// a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i
+	xvmaddasp	vs60, vs2, vs14		// a2_r*b3_r, a2_i*b3_r, a1_r*b3_r, a1_i*b3_r
+	xvmaddasp	vs61, vs2, vs15		// a2_r*b3_i, a2_i*b3_i, a1_r*b3_i, a1_i*b3_i
+	xvmaddasp	vs62, vs3, vs14		// a3_r*b3_r, a3_i*b3_r, a1_r*b3_r, a1_i*b3_r
+	xvmaddasp	vs63, vs3, vs15		// a3_r*b3_i, a3_i*b3_i, a1_r*b3_i, a1_i*b3_i
+
+
+.endm
+
+// SAVE4x8: writes the accumulated tile back to C (continues past this hunk
+// fragment).  Spills each accumulator pair through the scratch area TBUFFER,
+// reduces with the XSFADD_* macros, scales by alpha_r/alpha_i and adds into C
+// (read-modify-write of C is skipped under TRMMKERNEL).  NOTE(review): the
+// alpha_r/alpha_i complex-style scaling comments suggest this save path was
+// derived from a complex kernel — confirm against the kernel's entry code.
+.macro SAVE4x8
+
+	mr	T1, CO
+
+// N=0
+
+	mr	T2, T1
+
+// N=0 M=0
+
+	xxlxor	vs4, vs4, vs4
+	xxlxor	vs5, vs5, vs5
+	xxlxor	vs6, vs6, vs6
+	xxlxor	vs7, vs7, vs7
+
+#ifndef TRMMKERNEL
+	lxvw4x	vs0, o0, T2	// c0, c1
+#else
+	xxlxor	vs0, vs0, vs0
+#endif
+
+	stxvw4x	vs32, o0, TBUFFER
+
+	lxsspx	vs8, o0, TBUFFER
+	lxsspx	vs9, o4, TBUFFER
+	lxsspx	vs10, o8, TBUFFER
+	lxsspx	vs11, o12, TBUFFER
+
+	stxvw4x	vs33, o0, TBUFFER
+
+	lxsspx	vs12, o0, TBUFFER
+	lxsspx	vs13, o4, TBUFFER
+	lxsspx	vs14, o8, TBUFFER
+	lxsspx	vs15, o12, TBUFFER
+
+	XSFADD_R1	vs4, vs4, vs8		// add a0_r * b0_r
+	XSFADD_I2	vs5, vs5, vs12		// add a0_r * b0_i
+	XSFADD_R1	vs6, vs6, vs10		// add a1_r * b0_r
+	XSFADD_I2	vs7, vs7, vs14		// add a1_r * b0_i
+
+	XSFADD_R2	vs4, vs4, vs13		// add a0_i * b0_i
+	XSFADD_I1	vs5, vs5, vs9		// add a0_i * b0_r
+	XSFADD_R2	vs6, vs6, vs15		// add a1_i * b0_i
+	XSFADD_I1	vs7, vs7, vs11		// add a1_i * b0_r
+
+	xsmulsp	vs16, vs4, alpha_r	// r0_r * alpha_r
+	xsmulsp	vs17, vs5, alpha_i	// r0_i * alpha_i
+	xsmulsp	vs18, vs4, alpha_i	// r0_r * alpha_i
+	xsmulsp	vs19, vs5, alpha_r	// r0_i * alpha_r
+
+	xssubsp	vs20, vs16, vs17	// r0_r * alpha_r - r0_i * alpha_i
+	xsaddsp	vs21, vs18, vs19	// r0_r * alpha_i + r0_i * alpha_r
+
+	xsmulsp	vs16, vs6, alpha_r	// r1_r * alpha_r
+	xsmulsp	vs17, vs7, alpha_i	// r1_i * alpha_i
+	xsmulsp	vs18, vs6, alpha_i	// r1_r * alpha_i
+	xsmulsp	vs19, vs7, alpha_r	// r1_i * alpha_r
+
+	xssubsp	vs22, vs16, vs17	// r1_r * alpha_r - r1_i * alpha_i
+	xsaddsp	vs23, vs18, vs19	// r1_r *
alpha_i + r1_i * alpha_r + + stxsspx vs20, o0, TBUFFER // store r0_r + stxsspx vs21, o4, TBUFFER // store r0_i + stxsspx vs22, o8, TBUFFER // store r1_r + stxsspx vs23, o12, TBUFFER // store r1_i + lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=0 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + stxvw4x vs34, o0, TBUFFER + + lxsspx vs8, o0, TBUFFER + lxsspx vs9, o4, TBUFFER + lxsspx vs10, o8, TBUFFER + lxsspx vs11, o12, TBUFFER + + stxvw4x vs35, o0, TBUFFER + + lxsspx vs12, o0, TBUFFER + lxsspx vs13, o4, TBUFFER + lxsspx vs14, o8, TBUFFER + lxsspx vs15, o12, TBUFFER + + XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r + xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i + xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i + xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + + xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r + xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i + xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i + xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + + xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + stxsspx vs20, o0, TBUFFER // store r0_r + stxsspx vs21, o4, TBUFFER // store r0_i + stxsspx vs22, o8, TBUFFER // store r1_r + stxsspx vs23, o12, TBUFFER // store r1_i + lxvw4x vs1, 
o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=0 M=4 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + stxvw4x vs36, o0, TBUFFER + + lxsspx vs8, o0, TBUFFER + lxsspx vs9, o4, TBUFFER + lxsspx vs10, o8, TBUFFER + lxsspx vs11, o12, TBUFFER + + stxvw4x vs37, o0, TBUFFER + + lxsspx vs12, o0, TBUFFER + lxsspx vs13, o4, TBUFFER + lxsspx vs14, o8, TBUFFER + lxsspx vs15, o12, TBUFFER + + XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r + xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i + xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i + xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + + xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r + xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i + xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i + xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + + xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + stxsspx vs20, o0, TBUFFER // store r0_r + stxsspx vs21, o4, TBUFFER // store r0_i + stxsspx vs22, o8, TBUFFER // store r1_r + stxsspx vs23, o12, TBUFFER // store r1_i + lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=0 M=6 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + 
xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + stxvw4x vs38, o0, TBUFFER + + lxsspx vs8, o0, TBUFFER + lxsspx vs9, o4, TBUFFER + lxsspx vs10, o8, TBUFFER + lxsspx vs11, o12, TBUFFER + + stxvw4x vs39, o0, TBUFFER + + lxsspx vs12, o0, TBUFFER + lxsspx vs13, o4, TBUFFER + lxsspx vs14, o8, TBUFFER + lxsspx vs15, o12, TBUFFER + + XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r + xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i + xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i + xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + + xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r + xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i + xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i + xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + + xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + stxsspx vs20, o0, TBUFFER // store r0_r + stxsspx vs21, o4, TBUFFER // store r0_i + stxsspx vs22, o8, TBUFFER // store r1_r + stxsspx vs23, o12, TBUFFER // store r1_i + lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + +// N=1 + + mr T2, T1 + +// N=1 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + stxvw4x vs40, o0, TBUFFER + + lxsspx vs8, o0, 
TBUFFER + lxsspx vs9, o4, TBUFFER + lxsspx vs10, o8, TBUFFER + lxsspx vs11, o12, TBUFFER + + stxvw4x vs41, o0, TBUFFER + + lxsspx vs12, o0, TBUFFER + lxsspx vs13, o4, TBUFFER + lxsspx vs14, o8, TBUFFER + lxsspx vs15, o12, TBUFFER + + XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r + xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i + xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i + xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + + xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r + xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i + xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i + xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + + xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + stxsspx vs20, o0, TBUFFER // store r0_r + stxsspx vs21, o4, TBUFFER // store r0_i + stxsspx vs22, o8, TBUFFER // store r1_r + stxsspx vs23, o12, TBUFFER // store r1_i + lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=1 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + stxvw4x vs42, o0, TBUFFER + + lxsspx vs8, o0, TBUFFER + lxsspx vs9, o4, TBUFFER + lxsspx vs10, o8, TBUFFER + lxsspx vs11, o12, TBUFFER + + stxvw4x vs43, o0, TBUFFER + + lxsspx vs12, o0, TBUFFER + lxsspx vs13, o4, TBUFFER + lxsspx vs14, o8, TBUFFER + lxsspx 
vs15, o12, TBUFFER + + XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r + xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i + xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i + xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + + xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r + xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i + xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i + xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + + xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + stxsspx vs20, o0, TBUFFER // store r0_r + stxsspx vs21, o4, TBUFFER // store r0_i + stxsspx vs22, o8, TBUFFER // store r1_r + stxsspx vs23, o12, TBUFFER // store r1_i + lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=1 M=4 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + stxvw4x vs44, o0, TBUFFER + + lxsspx vs8, o0, TBUFFER + lxsspx vs9, o4, TBUFFER + lxsspx vs10, o8, TBUFFER + lxsspx vs11, o12, TBUFFER + + stxvw4x vs45, o0, TBUFFER + + lxsspx vs12, o0, TBUFFER + lxsspx vs13, o4, TBUFFER + lxsspx vs14, o8, TBUFFER + lxsspx vs15, o12, TBUFFER + + XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + 
XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r + xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i + xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i + xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + + xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r + xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i + xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i + xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + + xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + stxsspx vs20, o0, TBUFFER // store r0_r + stxsspx vs21, o4, TBUFFER // store r0_i + stxsspx vs22, o8, TBUFFER // store r1_r + stxsspx vs23, o12, TBUFFER // store r1_i + lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=1 M=6 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + stxvw4x vs46, o0, TBUFFER + + lxsspx vs8, o0, TBUFFER + lxsspx vs9, o4, TBUFFER + lxsspx vs10, o8, TBUFFER + lxsspx vs11, o12, TBUFFER + + stxvw4x vs47, o0, TBUFFER + + lxsspx vs12, o0, TBUFFER + lxsspx vs13, o4, TBUFFER + lxsspx vs14, o8, TBUFFER + lxsspx vs15, o12, TBUFFER + + XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xsmulsp vs16, vs4, 
alpha_r // r0_r * alpha_r + xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i + xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i + xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + + xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r + xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i + xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i + xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + + xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + stxsspx vs20, o0, TBUFFER // store r0_r + stxsspx vs21, o4, TBUFFER // store r0_i + stxsspx vs22, o8, TBUFFER // store r1_r + stxsspx vs23, o12, TBUFFER // store r1_i + lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + +// N=2 + + mr T2, T1 + +// N=2 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + stxvw4x vs48, o0, TBUFFER + + lxsspx vs8, o0, TBUFFER + lxsspx vs9, o4, TBUFFER + lxsspx vs10, o8, TBUFFER + lxsspx vs11, o12, TBUFFER + + stxvw4x vs49, o0, TBUFFER + + lxsspx vs12, o0, TBUFFER + lxsspx vs13, o4, TBUFFER + lxsspx vs14, o8, TBUFFER + lxsspx vs15, o12, TBUFFER + + XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r + xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i + xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i + xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r 
+ + xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r + xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i + xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i + xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + + xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + stxsspx vs20, o0, TBUFFER // store r0_r + stxsspx vs21, o4, TBUFFER // store r0_i + stxsspx vs22, o8, TBUFFER // store r1_r + stxsspx vs23, o12, TBUFFER // store r1_i + lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=2 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + stxvw4x vs50, o0, TBUFFER + + lxsspx vs8, o0, TBUFFER + lxsspx vs9, o4, TBUFFER + lxsspx vs10, o8, TBUFFER + lxsspx vs11, o12, TBUFFER + + stxvw4x vs51, o0, TBUFFER + + lxsspx vs12, o0, TBUFFER + lxsspx vs13, o4, TBUFFER + lxsspx vs14, o8, TBUFFER + lxsspx vs15, o12, TBUFFER + + XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r + xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i + xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i + xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + + xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r + xsmulsp vs17, vs7, alpha_i // 
r1_i * alpha_i + xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i + xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + + xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + stxsspx vs20, o0, TBUFFER // store r0_r + stxsspx vs21, o4, TBUFFER // store r0_i + stxsspx vs22, o8, TBUFFER // store r1_r + stxsspx vs23, o12, TBUFFER // store r1_i + lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=2 M=4 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + stxvw4x vs52, o0, TBUFFER + + lxsspx vs8, o0, TBUFFER + lxsspx vs9, o4, TBUFFER + lxsspx vs10, o8, TBUFFER + lxsspx vs11, o12, TBUFFER + + stxvw4x vs53, o0, TBUFFER + + lxsspx vs12, o0, TBUFFER + lxsspx vs13, o4, TBUFFER + lxsspx vs14, o8, TBUFFER + lxsspx vs15, o12, TBUFFER + + XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r + xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i + xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i + xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + + xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r + xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i + xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i + xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + + xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xsaddsp vs23, vs18, vs19 // r1_r * 
alpha_i + r1_i * alpha_r + + stxsspx vs20, o0, TBUFFER // store r0_r + stxsspx vs21, o4, TBUFFER // store r0_i + stxsspx vs22, o8, TBUFFER // store r1_r + stxsspx vs23, o12, TBUFFER // store r1_i + lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=2 M=6 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + stxvw4x vs54, o0, TBUFFER + + lxsspx vs8, o0, TBUFFER + lxsspx vs9, o4, TBUFFER + lxsspx vs10, o8, TBUFFER + lxsspx vs11, o12, TBUFFER + + stxvw4x vs55, o0, TBUFFER + + lxsspx vs12, o0, TBUFFER + lxsspx vs13, o4, TBUFFER + lxsspx vs14, o8, TBUFFER + lxsspx vs15, o12, TBUFFER + + XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r + xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i + xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i + xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + + xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r + xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i + xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i + xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + + xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + stxsspx vs20, o0, TBUFFER // store r0_r + stxsspx vs21, o4, TBUFFER // store r0_i + stxsspx vs22, o8, TBUFFER // store r1_r + stxsspx vs23, o12, TBUFFER // store r1_i + lxvw4x vs1, 
o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + +// N=3 + + mr T2, T1 + +// N=3 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + stxvw4x vs56, o0, TBUFFER + + lxsspx vs8, o0, TBUFFER + lxsspx vs9, o4, TBUFFER + lxsspx vs10, o8, TBUFFER + lxsspx vs11, o12, TBUFFER + + stxvw4x vs57, o0, TBUFFER + + lxsspx vs12, o0, TBUFFER + lxsspx vs13, o4, TBUFFER + lxsspx vs14, o8, TBUFFER + lxsspx vs15, o12, TBUFFER + + XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r + xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i + xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i + xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + + xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r + xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i + xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i + xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + + xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + stxsspx vs20, o0, TBUFFER // store r0_r + stxsspx vs21, o4, TBUFFER // store r0_i + stxsspx vs22, o8, TBUFFER // store r1_r + stxsspx vs23, o12, TBUFFER // store r1_i + lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=3 M=2 + + xxlxor vs4, vs4, vs4 + 
xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + stxvw4x vs58, o0, TBUFFER + + lxsspx vs8, o0, TBUFFER + lxsspx vs9, o4, TBUFFER + lxsspx vs10, o8, TBUFFER + lxsspx vs11, o12, TBUFFER + + stxvw4x vs59, o0, TBUFFER + + lxsspx vs12, o0, TBUFFER + lxsspx vs13, o4, TBUFFER + lxsspx vs14, o8, TBUFFER + lxsspx vs15, o12, TBUFFER + + XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r + xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i + xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i + xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + + xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r + xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i + xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i + xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + + xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + stxsspx vs20, o0, TBUFFER // store r0_r + stxsspx vs21, o4, TBUFFER // store r0_i + stxsspx vs22, o8, TBUFFER // store r1_r + stxsspx vs23, o12, TBUFFER // store r1_i + lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=3 M=4 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + stxvw4x vs60, o0, TBUFFER + + lxsspx vs8, o0, 
TBUFFER + lxsspx vs9, o4, TBUFFER + lxsspx vs10, o8, TBUFFER + lxsspx vs11, o12, TBUFFER + + stxvw4x vs61, o0, TBUFFER + + lxsspx vs12, o0, TBUFFER + lxsspx vs13, o4, TBUFFER + lxsspx vs14, o8, TBUFFER + lxsspx vs15, o12, TBUFFER + + XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r + xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i + xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i + xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + + xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r + xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i + xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i + xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + + xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + stxsspx vs20, o0, TBUFFER // store r0_r + stxsspx vs21, o4, TBUFFER // store r0_i + stxsspx vs22, o8, TBUFFER // store r1_r + stxsspx vs23, o12, TBUFFER // store r1_i + lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=3 M=6 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + stxvw4x vs62, o0, TBUFFER + + lxsspx vs8, o0, TBUFFER + lxsspx vs9, o4, TBUFFER + lxsspx vs10, o8, TBUFFER + lxsspx vs11, o12, TBUFFER + + stxvw4x vs63, o0, TBUFFER + + lxsspx vs12, o0, TBUFFER + lxsspx vs13, o4, TBUFFER + lxsspx vs14, o8, TBUFFER + lxsspx 
vs15, o12, TBUFFER + + XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r + xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i + xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i + xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + + xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r + xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i + xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i + xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + + xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + stxsspx vs20, o0, TBUFFER // store r0_r + stxsspx vs21, o4, TBUFFER // store r0_i + stxsspx vs22, o8, TBUFFER // store r1_r + stxsspx vs23, o12, TBUFFER // store r1_i + lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + addi CO, CO, 64 + +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=4 +**********************************************************************************************/ + +.macro LOAD4x4_1 + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + + addi AO, AO, 32 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + lxvw4x vs25, o16, BO // load b2, b3 + + xxspltw vs12, vs25, 0 + xxspltw vs13, vs25, 1 + xxspltw vs14, vs25, 2 + xxspltw vs15, vs25, 3 + + + 
addi BO, BO, 32 + +.endm + +.macro KERNEL4x4_I1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + lxvw4x vs5, o16, AO // load a2, a3 + + + addi AO, AO, 32 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs16, vs24, 0 + xxspltw vs17, vs24, 1 + xxspltw vs18, vs24, 2 + xxspltw vs19, vs24, 3 + + lxvw4x vs25, o16, BO // load b2, b3 + + xxspltw vs20, vs25, 0 + xxspltw vs21, vs25, 1 + xxspltw vs22, vs25, 2 + xxspltw vs23, vs25, 3 + + + addi BO, BO, 32 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmulsp vs36, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs37, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs38, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmulsp vs40, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmulsp vs41, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmulsp vs42, vs1, vs12 // a1_r*b2_r, a1_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmulsp vs43, vs1, vs13 // a1_r*b2_i, a1_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmulsp vs44, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmulsp vs45, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmulsp vs46, vs1, vs14 // a1_r*b3_r, a1_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmulsp vs47, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro KERNEL4x4_1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + lxvw4x vs5, o16, AO // load a2, a3 + + + addi AO, AO, 32 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs16, vs24, 0 + xxspltw vs17, vs24, 1 + xxspltw vs18, vs24, 2 + xxspltw vs19, vs24, 3 + + lxvw4x vs25, o16, BO // load b2, b3 + + xxspltw vs20, vs25, 0 + xxspltw vs21, vs25, 1 + xxspltw vs22, vs25, 2 + xxspltw 
vs23, vs25, 3 + + + addi BO, BO, 32 + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs36, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs37, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs38, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmaddasp vs40, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs41, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs42, vs1, vs12 // a1_r*b2_r, a1_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs43, vs1, vs13 // a1_r*b2_i, a1_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmaddasp vs44, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs45, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmaddasp vs46, vs1, vs14 // a1_r*b3_r, a1_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs47, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro KERNEL4x4_2 + + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + + addi AO, AO, 32 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + lxvw4x vs25, o16, BO // load b2, b3 + + xxspltw vs12, vs25, 0 + xxspltw vs13, vs25, 1 + xxspltw vs14, vs25, 2 + xxspltw vs15, vs25, 3 + + + addi BO, BO, 32 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs36, vs4, vs18 // a4_r*b1_r, 
a4_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs37, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs38, vs5, vs18 // a5_r*b1_r, a5_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs39, vs5, vs19 // a5_r*b1_i, a5_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmaddasp vs40, vs4, vs20 // a4_r*b2_r, a4_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs41, vs4, vs21 // a4_r*b2_i, a4_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs42, vs5, vs20 // a5_r*b2_r, a5_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs43, vs5, vs21 // a5_r*b2_i, a5_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmaddasp vs44, vs4, vs22 // a4_r*b3_r, a4_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs45, vs4, vs23 // a4_r*b3_i, a4_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmaddasp vs46, vs5, vs22 // a5_r*b3_r, a5_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs47, vs5, vs23 // a5_r*b3_i, a5_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro KERNEL4x4_E2 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs36, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs37, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs38, vs5, vs18 // a5_r*b1_r, a5_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs39, vs5, vs19 // a5_r*b1_i, a5_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmaddasp vs40, vs4, vs20 // a4_r*b2_r, a4_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs41, vs4, vs21 // a4_r*b2_i, a4_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs42, vs5, vs20 // a5_r*b2_r, a5_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs43, vs5, vs21 // a5_r*b2_i, a5_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmaddasp vs44, vs4, vs22 // a4_r*b3_r, a4_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs45, vs4, vs23 // a4_r*b3_i, a4_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmaddasp vs46, vs5, vs22 // a5_r*b3_r, a5_i*b3_r, a1_r*b3_r, 
a1_i*b3_r + xvmaddasp vs47, vs5, vs23 // a5_r*b3_i, a5_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro KERNEL4x4_SUBI1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + + addi AO, AO, 32 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + lxvw4x vs25, o16, BO // load b2, b3 + + xxspltw vs12, vs25, 0 + xxspltw vs13, vs25, 1 + xxspltw vs14, vs25, 2 + xxspltw vs15, vs25, 3 + + + addi BO, BO, 32 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmulsp vs36, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs37, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs38, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmulsp vs40, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmulsp vs41, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmulsp vs42, vs1, vs12 // a1_r*b2_r, a1_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmulsp vs43, vs1, vs13 // a1_r*b2_i, a1_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmulsp vs44, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmulsp vs45, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmulsp vs46, vs1, vs14 // a1_r*b3_r, a1_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmulsp vs47, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro KERNEL4x4_SUB1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + + addi AO, AO, 32 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + lxvw4x vs25, o16, BO // load b2, b3 + + xxspltw 
vs12, vs25, 0 + xxspltw vs13, vs25, 1 + xxspltw vs14, vs25, 2 + xxspltw vs15, vs25, 3 + + + addi BO, BO, 32 + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs36, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs37, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs38, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmaddasp vs40, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs41, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs42, vs1, vs12 // a1_r*b2_r, a1_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs43, vs1, vs13 // a1_r*b2_i, a1_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmaddasp vs44, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs45, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmaddasp vs46, vs1, vs14 // a1_r*b3_r, a1_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs47, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro SAVE4x4 + + mr T1, CO + +// N=0 + + mr T2, T1 + +// N=0 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + stxvw4x vs32, o0, TBUFFER + + lxsspx vs8, o0, TBUFFER + lxsspx vs9, o4, TBUFFER + lxsspx vs10, o8, TBUFFER + lxsspx vs11, o12, TBUFFER + + stxvw4x vs33, o0, TBUFFER + + lxsspx vs12, o0, TBUFFER + lxsspx vs13, o4, TBUFFER + lxsspx vs14, o8, TBUFFER + lxsspx vs15, o12, TBUFFER + + XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XSFADD_I2 vs7, 
vs7, vs14 // add a1_r * b0_i + + XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r + xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i + xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i + xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + + xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r + xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i + xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i + xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + + xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + stxsspx vs20, o0, TBUFFER // store r0_r + stxsspx vs21, o4, TBUFFER // store r0_i + stxsspx vs22, o8, TBUFFER // store r1_r + stxsspx vs23, o12, TBUFFER // store r1_i + lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=0 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + stxvw4x vs34, o0, TBUFFER + + lxsspx vs8, o0, TBUFFER + lxsspx vs9, o4, TBUFFER + lxsspx vs10, o8, TBUFFER + lxsspx vs11, o12, TBUFFER + + stxvw4x vs35, o0, TBUFFER + + lxsspx vs12, o0, TBUFFER + lxsspx vs13, o4, TBUFFER + lxsspx vs14, o8, TBUFFER + lxsspx vs15, o12, TBUFFER + + XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XSFADD_I1 vs7, vs7, vs11 // add a1_i * 
b0_r + + xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r + xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i + xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i + xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + + xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r + xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i + xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i + xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + + xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + stxsspx vs20, o0, TBUFFER // store r0_r + stxsspx vs21, o4, TBUFFER // store r0_i + stxsspx vs22, o8, TBUFFER // store r1_r + stxsspx vs23, o12, TBUFFER // store r1_i + lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + +// N=1 + + mr T2, T1 + +// N=1 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + stxvw4x vs36, o0, TBUFFER + + lxsspx vs8, o0, TBUFFER + lxsspx vs9, o4, TBUFFER + lxsspx vs10, o8, TBUFFER + lxsspx vs11, o12, TBUFFER + + stxvw4x vs37, o0, TBUFFER + + lxsspx vs12, o0, TBUFFER + lxsspx vs13, o4, TBUFFER + lxsspx vs14, o8, TBUFFER + lxsspx vs15, o12, TBUFFER + + XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r + xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i + xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i + xsmulsp vs19, 
vs5, alpha_r // r0_i * alpha_r + + xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r + xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i + xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i + xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + + xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + stxsspx vs20, o0, TBUFFER // store r0_r + stxsspx vs21, o4, TBUFFER // store r0_i + stxsspx vs22, o8, TBUFFER // store r1_r + stxsspx vs23, o12, TBUFFER // store r1_i + lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=1 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + stxvw4x vs38, o0, TBUFFER + + lxsspx vs8, o0, TBUFFER + lxsspx vs9, o4, TBUFFER + lxsspx vs10, o8, TBUFFER + lxsspx vs11, o12, TBUFFER + + stxvw4x vs39, o0, TBUFFER + + lxsspx vs12, o0, TBUFFER + lxsspx vs13, o4, TBUFFER + lxsspx vs14, o8, TBUFFER + lxsspx vs15, o12, TBUFFER + + XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r + xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i + xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i + xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + + xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r + 
xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i + xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i + xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + + xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + stxsspx vs20, o0, TBUFFER // store r0_r + stxsspx vs21, o4, TBUFFER // store r0_i + stxsspx vs22, o8, TBUFFER // store r1_r + stxsspx vs23, o12, TBUFFER // store r1_i + lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + +// N=2 + + mr T2, T1 + +// N=2 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + stxvw4x vs40, o0, TBUFFER + + lxsspx vs8, o0, TBUFFER + lxsspx vs9, o4, TBUFFER + lxsspx vs10, o8, TBUFFER + lxsspx vs11, o12, TBUFFER + + stxvw4x vs41, o0, TBUFFER + + lxsspx vs12, o0, TBUFFER + lxsspx vs13, o4, TBUFFER + lxsspx vs14, o8, TBUFFER + lxsspx vs15, o12, TBUFFER + + XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r + xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i + xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i + xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + + xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r + xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i + xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i + xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + + xssubsp vs22, vs16, 
vs17 // r1_r * alpha_r - r1_i * alpha_i + xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + stxsspx vs20, o0, TBUFFER // store r0_r + stxsspx vs21, o4, TBUFFER // store r0_i + stxsspx vs22, o8, TBUFFER // store r1_r + stxsspx vs23, o12, TBUFFER // store r1_i + lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=2 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + stxvw4x vs42, o0, TBUFFER + + lxsspx vs8, o0, TBUFFER + lxsspx vs9, o4, TBUFFER + lxsspx vs10, o8, TBUFFER + lxsspx vs11, o12, TBUFFER + + stxvw4x vs43, o0, TBUFFER + + lxsspx vs12, o0, TBUFFER + lxsspx vs13, o4, TBUFFER + lxsspx vs14, o8, TBUFFER + lxsspx vs15, o12, TBUFFER + + XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r + xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i + xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i + xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + + xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r + xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i + xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i + xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + + xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + stxsspx vs20, o0, TBUFFER // store r0_r + stxsspx vs21, o4, TBUFFER // store r0_i + stxsspx vs22, o8, 
TBUFFER // store r1_r + stxsspx vs23, o12, TBUFFER // store r1_i + lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + +// N=3 + + mr T2, T1 + +// N=3 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + stxvw4x vs44, o0, TBUFFER + + lxsspx vs8, o0, TBUFFER + lxsspx vs9, o4, TBUFFER + lxsspx vs10, o8, TBUFFER + lxsspx vs11, o12, TBUFFER + + stxvw4x vs45, o0, TBUFFER + + lxsspx vs12, o0, TBUFFER + lxsspx vs13, o4, TBUFFER + lxsspx vs14, o8, TBUFFER + lxsspx vs15, o12, TBUFFER + + XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r + xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i + xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i + xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + + xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r + xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i + xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i + xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + + xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + stxsspx vs20, o0, TBUFFER // store r0_r + stxsspx vs21, o4, TBUFFER // store r0_i + stxsspx vs22, o8, TBUFFER // store r1_r + stxsspx vs23, o12, TBUFFER // store r1_i + lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, 
o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=3 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + stxvw4x vs46, o0, TBUFFER + + lxsspx vs8, o0, TBUFFER + lxsspx vs9, o4, TBUFFER + lxsspx vs10, o8, TBUFFER + lxsspx vs11, o12, TBUFFER + + stxvw4x vs47, o0, TBUFFER + + lxsspx vs12, o0, TBUFFER + lxsspx vs13, o4, TBUFFER + lxsspx vs14, o8, TBUFFER + lxsspx vs15, o12, TBUFFER + + XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r + xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i + xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i + xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + + xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r + xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i + xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i + xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + + xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + stxsspx vs20, o0, TBUFFER // store r0_r + stxsspx vs21, o4, TBUFFER // store r0_i + stxsspx vs22, o8, TBUFFER // store r1_r + stxsspx vs23, o12, TBUFFER // store r1_i + lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + addi CO, CO, 32 + +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=2 
+**********************************************************************************************/
+
+/* NOTE(review): these macros accumulate split real/imaginary partial products and the
+   SAVE macros scale by alpha_r/alpha_i (complex-kernel pattern), although the patch is
+   titled sgemm — confirm against the enclosing file before relying on the r/i naming.
+   Register conventions (from surrounding code): AO/BO = packed A/B panel pointers,
+   CO = C pointer, LDC = C column stride, T1/T2 = scratch pointers, TBUFFER = 16-byte
+   scratch area used to shuffle between vector and scalar FP registers,
+   o0/o4/o8/o12/o16/o32/o48 = preloaded offset registers. */
+
+/* N=4, M=2: prologue load — fetch one A pair into vs0 and splat the four
+   B values for this k-step into vs8..vs15 (current operand set). */
+.macro LOAD4x2_1
+
+	lxvw4x		vs0, o0, AO		// a0, a1
+	addi		AO, AO, 16
+
+	lxvw4x		vs24, o0, BO		// b0, b1
+	xxspltw		vs8, vs24, 0
+	xxspltw		vs9, vs24, 1
+	xxspltw		vs10, vs24, 2
+	xxspltw		vs11, vs24, 3
+
+	lxvw4x		vs25, o16, BO		// b2, b3
+	xxspltw		vs12, vs25, 0
+	xxspltw		vs13, vs25, 1
+	xxspltw		vs14, vs25, 2
+	xxspltw		vs15, vs25, 3
+
+	addi		BO, BO, 32
+
+.endm
+
+/* First pipelined iteration: prefetch the NEXT operand set (vs4, vs16..vs23)
+   while initialising accumulators vs32..vs39 from the CURRENT set (multiply,
+   no accumulate). */
+.macro KERNEL4x2_I1
+
+	lxvw4x		vs4, o0, AO		// next a0, a1
+	addi		AO, AO, 16
+
+	lxvw4x		vs24, o0, BO		// next b0, b1
+	xxspltw		vs16, vs24, 0
+	xxspltw		vs17, vs24, 1
+	xxspltw		vs18, vs24, 2
+	xxspltw		vs19, vs24, 3
+
+	lxvw4x		vs25, o16, BO		// next b2, b3
+	xxspltw		vs20, vs25, 0
+	xxspltw		vs21, vs25, 1
+	xxspltw		vs22, vs25, 2
+	xxspltw		vs23, vs25, 3
+
+	addi		BO, BO, 32
+
+	// vs32..vs39 = a(vs0) * b0..b3 (vs8..vs15)
+	xvmulsp		vs32, vs0, vs8
+	xvmulsp		vs33, vs0, vs9
+	xvmulsp		vs34, vs0, vs10
+	xvmulsp		vs35, vs0, vs11
+	xvmulsp		vs36, vs0, vs12
+	xvmulsp		vs37, vs0, vs13
+	xvmulsp		vs38, vs0, vs14
+	xvmulsp		vs39, vs0, vs15
+
+.endm
+
+/* Even pipelined iteration: prefetch next set (vs4, vs16..vs23), accumulate
+   with the current set (vs0, vs8..vs15). */
+.macro KERNEL4x2_1
+
+	lxvw4x		vs4, o0, AO		// next a0, a1
+	addi		AO, AO, 16
+
+	lxvw4x		vs24, o0, BO		// next b0, b1
+	xxspltw		vs16, vs24, 0
+	xxspltw		vs17, vs24, 1
+	xxspltw		vs18, vs24, 2
+	xxspltw		vs19, vs24, 3
+
+	lxvw4x		vs25, o16, BO		// next b2, b3
+	xxspltw		vs20, vs25, 0
+	xxspltw		vs21, vs25, 1
+	xxspltw		vs22, vs25, 2
+	xxspltw		vs23, vs25, 3
+
+	addi		BO, BO, 32
+
+	// vs32..vs39 += a(vs0) * b0..b3 (vs8..vs15)
+	xvmaddasp	vs32, vs0, vs8
+	xvmaddasp	vs33, vs0, vs9
+	xvmaddasp	vs34, vs0, vs10
+	xvmaddasp	vs35, vs0, vs11
+	xvmaddasp	vs36, vs0, vs12
+	xvmaddasp	vs37, vs0, vs13
+	xvmaddasp	vs38, vs0, vs14
+	xvmaddasp	vs39, vs0, vs15
+
+.endm
+
+/* Odd pipelined iteration: roles swapped — prefetch into the current set
+   (vs0, vs8..vs15), accumulate with the prefetched set (vs4, vs16..vs23). */
+.macro KERNEL4x2_2
+
+	lxvw4x		vs0, o0, AO		// next a0, a1
+	addi		AO, AO, 16
+
+	lxvw4x		vs24, o0, BO		// next b0, b1
+	xxspltw		vs8, vs24, 0
+	xxspltw		vs9, vs24, 1
+	xxspltw		vs10, vs24, 2
+	xxspltw		vs11, vs24, 3
+
+	lxvw4x		vs25, o16, BO		// next b2, b3
+	xxspltw		vs12, vs25, 0
+	xxspltw		vs13, vs25, 1
+	xxspltw		vs14, vs25, 2
+	xxspltw		vs15, vs25, 3
+
+	addi		BO, BO, 32
+
+	// vs32..vs39 += a(vs4) * b0..b3 (vs16..vs23)
+	xvmaddasp	vs32, vs4, vs16
+	xvmaddasp	vs33, vs4, vs17
+	xvmaddasp	vs34, vs4, vs18
+	xvmaddasp	vs35, vs4, vs19
+	xvmaddasp	vs36, vs4, vs20
+	xvmaddasp	vs37, vs4, vs21
+	xvmaddasp	vs38, vs4, vs22
+	xvmaddasp	vs39, vs4, vs23
+
+.endm
+
+/* Pipeline epilogue: drain the last prefetched operand set, no further loads. */
+.macro KERNEL4x2_E2
+
+	xvmaddasp	vs32, vs4, vs16
+	xvmaddasp	vs33, vs4, vs17
+	xvmaddasp	vs34, vs4, vs18
+	xvmaddasp	vs35, vs4, vs19
+	xvmaddasp	vs36, vs4, vs20
+	xvmaddasp	vs37, vs4, vs21
+	xvmaddasp	vs38, vs4, vs22
+	xvmaddasp	vs39, vs4, vs23
+
+.endm
+
+/* Single (non-pipelined) k-step that INITIALISES the accumulators. */
+.macro KERNEL4x2_SUBI1
+
+	lxvw4x		vs0, o0, AO		// a0, a1
+	addi		AO, AO, 16
+
+	lxvw4x		vs24, o0, BO		// b0, b1
+	xxspltw		vs8, vs24, 0
+	xxspltw		vs9, vs24, 1
+	xxspltw		vs10, vs24, 2
+	xxspltw		vs11, vs24, 3
+
+	lxvw4x		vs25, o16, BO		// b2, b3
+	xxspltw		vs12, vs25, 0
+	xxspltw		vs13, vs25, 1
+	xxspltw		vs14, vs25, 2
+	xxspltw		vs15, vs25, 3
+
+	addi		BO, BO, 32
+
+	xvmulsp		vs32, vs0, vs8
+	xvmulsp		vs33, vs0, vs9
+	xvmulsp		vs34, vs0, vs10
+	xvmulsp		vs35, vs0, vs11
+	xvmulsp		vs36, vs0, vs12
+	xvmulsp		vs37, vs0, vs13
+	xvmulsp		vs38, vs0, vs14
+	xvmulsp		vs39, vs0, vs15
+
+.endm
+
+/* Single (non-pipelined) k-step that ACCUMULATES into existing partial sums. */
+.macro KERNEL4x2_SUB1
+
+	lxvw4x		vs0, o0, AO		// a0, a1
+	addi		AO, AO, 16
+
+	lxvw4x		vs24, o0, BO		// b0, b1
+	xxspltw		vs8, vs24, 0
+	xxspltw		vs9, vs24, 1
+	xxspltw		vs10, vs24, 2
+	xxspltw		vs11, vs24, 3
+
+	lxvw4x		vs25, o16, BO		// b2, b3
+	xxspltw		vs12, vs25, 0
+	xxspltw		vs13, vs25, 1
+	xxspltw		vs14, vs25, 2
+	xxspltw		vs15, vs25, 3
+
+	addi		BO, BO, 32
+
+	xvmaddasp	vs32, vs0, vs8
+	xvmaddasp	vs33, vs0, vs9
+	xvmaddasp	vs34, vs0, vs10
+	xvmaddasp	vs35, vs0, vs11
+	xvmaddasp	vs36, vs0, vs12
+	xvmaddasp	vs37, vs0, vs13
+	xvmaddasp	vs38, vs0, vs14
+	xvmaddasp	vs39, vs0, vs15
+
+.endm
+
+/* Write-back for N=4, M=2. For each of the 4 C columns: spill an accumulator
+   pair through TBUFFER into scalar FPRs, combine via the XSFADD_* reduction
+   macros, scale by (alpha_r, alpha_i), repack through TBUFFER and add into C.
+   Under TRMMKERNEL the existing C values are not read (beta == 0 semantics). */
+.macro SAVE4x2
+
+	mr		T1, CO
+
+// N=0
+
+	mr		T2, T1
+
+// N=0 M=0
+
+	xxlxor		vs4, vs4, vs4
+	xxlxor		vs5, vs5, vs5
+	xxlxor		vs6, vs6, vs6
+	xxlxor		vs7, vs7, vs7
+
+#ifndef TRMMKERNEL
+	lxvw4x		vs0, o0, T2		// c0, c1
+#else
+	xxlxor		vs0, vs0, vs0
+#endif
+
+	stxvw4x		vs32, o0, TBUFFER
+	lxsspx		vs8, o0, TBUFFER
+	lxsspx		vs9, o4, TBUFFER
+	lxsspx		vs10, o8, TBUFFER
+	lxsspx		vs11, o12, TBUFFER
+
+	stxvw4x		vs33, o0, TBUFFER
+	lxsspx		vs12, o0, TBUFFER
+	lxsspx		vs13, o4, TBUFFER
+	lxsspx		vs14, o8, TBUFFER
+	lxsspx		vs15, o12, TBUFFER
+
+	XSFADD_R1	vs4, vs4, vs8		// add a0_r * b0_r
+	XSFADD_I2	vs5, vs5, vs12		// add a0_r * b0_i
+	XSFADD_R1	vs6, vs6, vs10		// add a1_r * b0_r
+	XSFADD_I2	vs7, vs7, vs14		// add a1_r * b0_i
+
+	XSFADD_R2	vs4, vs4, vs13		// add a0_i * b0_i
+	XSFADD_I1	vs5, vs5, vs9		// add a0_i * b0_r
+	XSFADD_R2	vs6, vs6, vs15		// add a1_i * b0_i
+	XSFADD_I1	vs7, vs7, vs11		// add a1_i * b0_r
+
+	xsmulsp		vs16, vs4, alpha_r
+	xsmulsp		vs17, vs5, alpha_i
+	xsmulsp		vs18, vs4, alpha_i
+	xsmulsp		vs19, vs5, alpha_r
+	xssubsp		vs20, vs16, vs17	// r0_r*alpha_r - r0_i*alpha_i
+	xsaddsp		vs21, vs18, vs19	// r0_r*alpha_i + r0_i*alpha_r
+
+	xsmulsp		vs16, vs6, alpha_r
+	xsmulsp		vs17, vs7, alpha_i
+	xsmulsp		vs18, vs6, alpha_i
+	xsmulsp		vs19, vs7, alpha_r
+	xssubsp		vs22, vs16, vs17	// r1_r*alpha_r - r1_i*alpha_i
+	xsaddsp		vs23, vs18, vs19	// r1_r*alpha_i + r1_i*alpha_r
+
+	stxsspx		vs20, o0, TBUFFER
+	stxsspx		vs21, o4, TBUFFER
+	stxsspx		vs22, o8, TBUFFER
+	stxsspx		vs23, o12, TBUFFER
+	lxvw4x		vs1, o0, TBUFFER	// r0_r, r0_i, r1_r, r1_i
+	xvaddsp		vs0, vs0, vs1
+
+	stxvw4x		vs0, o0, T2		// c0, c1
+
+	addi		T2, T2, 16
+	add		T1, T1, LDC
+
+// N=1
+
+	mr		T2, T1
+
+// N=1 M=0
+
+	xxlxor		vs4, vs4, vs4
+	xxlxor		vs5, vs5, vs5
+	xxlxor		vs6, vs6, vs6
+	xxlxor		vs7, vs7, vs7
+
+#ifndef TRMMKERNEL
+	lxvw4x		vs0, o0, T2		// c0, c1
+#else
+	xxlxor		vs0, vs0, vs0
+#endif
+
+	stxvw4x		vs34, o0, TBUFFER
+	lxsspx		vs8, o0, TBUFFER
+	lxsspx		vs9, o4, TBUFFER
+	lxsspx		vs10, o8, TBUFFER
+	lxsspx		vs11, o12, TBUFFER
+
+	stxvw4x		vs35, o0, TBUFFER
+	lxsspx		vs12, o0, TBUFFER
+	lxsspx		vs13, o4, TBUFFER
+	lxsspx		vs14, o8, TBUFFER
+	lxsspx		vs15, o12, TBUFFER
+
+	XSFADD_R1	vs4, vs4, vs8
+	XSFADD_I2	vs5, vs5, vs12
+	XSFADD_R1	vs6, vs6, vs10
+	XSFADD_I2	vs7, vs7, vs14
+
+	XSFADD_R2	vs4, vs4, vs13
+	XSFADD_I1	vs5, vs5, vs9
+	XSFADD_R2	vs6, vs6, vs15
+	XSFADD_I1	vs7, vs7, vs11
+
+	xsmulsp		vs16, vs4, alpha_r
+	xsmulsp		vs17, vs5, alpha_i
+	xsmulsp		vs18, vs4, alpha_i
+	xsmulsp		vs19, vs5, alpha_r
+	xssubsp		vs20, vs16, vs17
+	xsaddsp		vs21, vs18, vs19
+
+	xsmulsp		vs16, vs6, alpha_r
+	xsmulsp		vs17, vs7, alpha_i
+	xsmulsp		vs18, vs6, alpha_i
+	xsmulsp		vs19, vs7, alpha_r
+	xssubsp		vs22, vs16, vs17
+	xsaddsp		vs23, vs18, vs19
+
+	stxsspx		vs20, o0, TBUFFER
+	stxsspx		vs21, o4, TBUFFER
+	stxsspx		vs22, o8, TBUFFER
+	stxsspx		vs23, o12, TBUFFER
+	lxvw4x		vs1, o0, TBUFFER
+	xvaddsp		vs0, vs0, vs1
+
+	stxvw4x		vs0, o0, T2		// c0, c1
+
+	addi		T2, T2, 16
+	add		T1, T1, LDC
+
+// N=2
+
+	mr		T2, T1
+
+// N=2 M=0
+
+	xxlxor		vs4, vs4, vs4
+	xxlxor		vs5, vs5, vs5
+	xxlxor		vs6, vs6, vs6
+	xxlxor		vs7, vs7, vs7
+
+#ifndef TRMMKERNEL
+	lxvw4x		vs0, o0, T2		// c0, c1
+#else
+	xxlxor		vs0, vs0, vs0
+#endif
+
+	stxvw4x		vs36, o0, TBUFFER
+	lxsspx		vs8, o0, TBUFFER
+	lxsspx		vs9, o4, TBUFFER
+	lxsspx		vs10, o8, TBUFFER
+	lxsspx		vs11, o12, TBUFFER
+
+	stxvw4x		vs37, o0, TBUFFER
+	lxsspx		vs12, o0, TBUFFER
+	lxsspx		vs13, o4, TBUFFER
+	lxsspx		vs14, o8, TBUFFER
+	lxsspx		vs15, o12, TBUFFER
+
+	XSFADD_R1	vs4, vs4, vs8
+	XSFADD_I2	vs5, vs5, vs12
+	XSFADD_R1	vs6, vs6, vs10
+	XSFADD_I2	vs7, vs7, vs14
+
+	XSFADD_R2	vs4, vs4, vs13
+	XSFADD_I1	vs5, vs5, vs9
+	XSFADD_R2	vs6, vs6, vs15
+	XSFADD_I1	vs7, vs7, vs11
+
+	xsmulsp		vs16, vs4, alpha_r
+	xsmulsp		vs17, vs5, alpha_i
+	xsmulsp		vs18, vs4, alpha_i
+	xsmulsp		vs19, vs5, alpha_r
+	xssubsp		vs20, vs16, vs17
+	xsaddsp		vs21, vs18, vs19
+
+	xsmulsp		vs16, vs6, alpha_r
+	xsmulsp		vs17, vs7, alpha_i
+	xsmulsp		vs18, vs6, alpha_i
+	xsmulsp		vs19, vs7, alpha_r
+	xssubsp		vs22, vs16, vs17
+	xsaddsp		vs23, vs18, vs19
+
+	stxsspx		vs20, o0, TBUFFER
+	stxsspx		vs21, o4, TBUFFER
+	stxsspx		vs22, o8, TBUFFER
+	stxsspx		vs23, o12, TBUFFER
+	lxvw4x		vs1, o0, TBUFFER
+	xvaddsp		vs0, vs0, vs1
+
+	stxvw4x		vs0, o0, T2		// c0, c1
+
+	addi		T2, T2, 16
+	add		T1, T1, LDC
+
+// N=3
+
+	mr		T2, T1
+
+// N=3 M=0
+
+	xxlxor		vs4, vs4, vs4
+	xxlxor		vs5, vs5, vs5
+	xxlxor		vs6, vs6, vs6
+	xxlxor		vs7, vs7, vs7
+
+#ifndef TRMMKERNEL
+	lxvw4x		vs0, o0, T2		// c0, c1
+#else
+	xxlxor		vs0, vs0, vs0
+#endif
+
+	stxvw4x		vs38, o0, TBUFFER
+	lxsspx		vs8, o0, TBUFFER
+	lxsspx		vs9, o4, TBUFFER
+	lxsspx		vs10, o8, TBUFFER
+	lxsspx		vs11, o12, TBUFFER
+
+	stxvw4x		vs39, o0, TBUFFER
+	lxsspx		vs12, o0, TBUFFER
+	lxsspx		vs13, o4, TBUFFER
+	lxsspx		vs14, o8, TBUFFER
+	lxsspx		vs15, o12, TBUFFER
+
+	XSFADD_R1	vs4, vs4, vs8
+	XSFADD_I2	vs5, vs5, vs12
+	XSFADD_R1	vs6, vs6, vs10
+	XSFADD_I2	vs7, vs7, vs14
+
+	XSFADD_R2	vs4, vs4, vs13
+	XSFADD_I1	vs5, vs5, vs9
+	XSFADD_R2	vs6, vs6, vs15
+	XSFADD_I1	vs7, vs7, vs11
+
+	xsmulsp		vs16, vs4, alpha_r
+	xsmulsp		vs17, vs5, alpha_i
+	xsmulsp		vs18, vs4, alpha_i
+	xsmulsp		vs19, vs5, alpha_r
+	xssubsp		vs20, vs16, vs17
+	xsaddsp		vs21, vs18, vs19
+
+	xsmulsp		vs16, vs6, alpha_r
+	xsmulsp		vs17, vs7, alpha_i
+	xsmulsp		vs18, vs6, alpha_i
+	xsmulsp		vs19, vs7, alpha_r
+	xssubsp		vs22, vs16, vs17
+	xsaddsp		vs23, vs18, vs19
+
+	stxsspx		vs20, o0, TBUFFER
+	stxsspx		vs21, o4, TBUFFER
+	stxsspx		vs22, o8, TBUFFER
+	stxsspx		vs23, o12, TBUFFER
+	lxvw4x		vs1, o0, TBUFFER
+	xvaddsp		vs0, vs0, vs1
+
+	stxvw4x		vs0, o0, T2		// c0, c1
+
+	addi		T2, T2, 16
+	add		T1, T1, LDC
+
+	addi		CO, CO, 16
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=4 and M=1
+**********************************************************************************************/
+
+/* N=4, M=1: scalar path — one A element (r,i in vs0/vs1) and four B values
+   loaded pairwise through T1. */
+.macro LOAD4x1_1
+
+	lxsspx		vs0, o0, AO		// a0_r
+	lxsspx		vs1, o4, AO		// a0_i
+	addi		AO, AO, 8
+
+	mr		T1, BO
+
+	lxsspx		vs8, o0, T1		// b0_r
+	lxsspx		vs9, o4, T1		// b0_i
+	addi		T1, T1, 8
+
+	lxsspx		vs10, o0, T1		// b1_r
+	lxsspx		vs11, o4, T1		// b1_i
+	addi		T1, T1, 8
+
+	lxsspx		vs12, o0, T1		// b2_r
+	lxsspx		vs13, o4, T1		// b2_i
+	addi		T1, T1, 8
+
+	lxsspx		vs14, o0, T1		// b3_r
+	lxsspx		vs15, o4, T1		// b3_i
+
+	addi		BO, BO, 32
+
+.endm
+
+/* First pipelined iteration (scalar): prefetch next operands, initialise
+   vs32..vs47 with the four partial products per B value. */
+.macro KERNEL4x1_I1
+
+	lxsspx		vs4, o0, AO		// next a0_r
+	lxsspx		vs5, o4, AO		// next a0_i
+	addi		AO, AO, 8
+
+	mr		T1, BO
+
+	lxsspx		vs16, o0, T1		// next b0_r
+	lxsspx		vs17, o4, T1		// next b0_i
+	addi		T1, T1, 8
+
+	lxsspx		vs18, o0, T1		// next b1_r
+	lxsspx		vs19, o4, T1		// next b1_i
+	addi		T1, T1, 8
+
+	lxsspx		vs20, o0, T1		// next b2_r
+	lxsspx		vs21, o4, T1		// next b2_i
+	addi		T1, T1, 8
+
+	lxsspx		vs22, o0, T1		// next b3_r
+	lxsspx		vs23, o4, T1		// next b3_i
+
+	addi		BO, BO, 32
+
+	xsmulsp		vs32, vs0, vs8		// a0_r*b0_r
+	xsmulsp		vs33, vs1, vs9		// a0_i*b0_i
+	xsmulsp		vs34, vs0, vs9		// a0_r*b0_i
+	xsmulsp		vs35, vs1, vs8		// a0_i*b0_r
+
+	xsmulsp		vs36, vs0, vs10		// a0_r*b1_r
+	xsmulsp		vs37, vs1, vs11		// a0_i*b1_i
+	xsmulsp		vs38, vs0, vs11		// a0_r*b1_i
+	xsmulsp		vs39, vs1, vs10		// a0_i*b1_r
+
+	xsmulsp		vs40, vs0, vs12		// a0_r*b2_r
+	xsmulsp		vs41, vs1, vs13		// a0_i*b2_i
+	xsmulsp		vs42, vs0, vs13		// a0_r*b2_i
+	xsmulsp		vs43, vs1, vs12		// a0_i*b2_r
+
+	xsmulsp		vs44, vs0, vs14		// a0_r*b3_r
+	xsmulsp		vs45, vs1, vs15		// a0_i*b3_i
+	xsmulsp		vs46, vs0, vs15		// a0_r*b3_i
+	xsmulsp		vs47, vs1, vs14		// a0_i*b3_r
+
+.endm
+
+/* Even pipelined iteration (scalar): prefetch next, accumulate current. */
+.macro KERNEL4x1_1
+
+	lxsspx		vs4, o0, AO		// next a0_r
+	lxsspx		vs5, o4, AO		// next a0_i
+	addi		AO, AO, 8
+
+	mr		T1, BO
+
+	lxsspx		vs16, o0, T1		// next b0_r
+	lxsspx		vs17, o4, T1		// next b0_i
+	addi		T1, T1, 8
+
+	lxsspx		vs18, o0, T1		// next b1_r
+	lxsspx		vs19, o4, T1		// next b1_i
+	addi		T1, T1, 8
+
+	lxsspx		vs20, o0, T1		// next b2_r
+	lxsspx		vs21, o4, T1		// next b2_i
+	addi		T1, T1, 8
+
+	lxsspx		vs22, o0, T1		// next b3_r
+	lxsspx		vs23, o4, T1		// next b3_i
+
+	addi		BO, BO, 32
+
+	xsmaddasp	vs32, vs0, vs8		// a0_r*b0_r
+	xsmaddasp	vs33, vs1, vs9		// a0_i*b0_i
+	xsmaddasp	vs34, vs0, vs9		// a0_r*b0_i
+	xsmaddasp	vs35, vs1, vs8		// a0_i*b0_r
+
+	xsmaddasp	vs36, vs0, vs10		// a0_r*b1_r
+	xsmaddasp	vs37, vs1, vs11		// a0_i*b1_i
+	xsmaddasp	vs38, vs0, vs11		// a0_r*b1_i
+	xsmaddasp	vs39, vs1, vs10		// a0_i*b1_r
+
+	xsmaddasp	vs40, vs0, vs12		// a0_r*b2_r
+	xsmaddasp	vs41, vs1, vs13		// a0_i*b2_i
+	xsmaddasp	vs42, vs0, vs13		// a0_r*b2_i
+	xsmaddasp	vs43, vs1, vs12		// a0_i*b2_r
+
+	xsmaddasp	vs44, vs0, vs14		// a0_r*b3_r
+	xsmaddasp	vs45, vs1, vs15		// a0_i*b3_i
+	xsmaddasp	vs46, vs0, vs15		// a0_r*b3_i
+	xsmaddasp	vs47, vs1, vs14		// a0_i*b3_r
+
+.endm
+
+/* Odd pipelined iteration (scalar): prefetch into current set, accumulate
+   with the prefetched set (vs4/vs5, vs16..vs23). */
+.macro KERNEL4x1_2
+
+	lxsspx		vs0, o0, AO		// next a0_r
+	lxsspx		vs1, o4, AO		// next a0_i
+	addi		AO, AO, 8
+
+	mr		T1, BO
+
+	lxsspx		vs8, o0, T1		// next b0_r
+	lxsspx		vs9, o4, T1		// next b0_i
+	addi		T1, T1, 8
+
+	lxsspx		vs10, o0, T1		// next b1_r
+	lxsspx		vs11, o4, T1		// next b1_i
+	addi		T1, T1, 8
+
+	lxsspx		vs12, o0, T1		// next b2_r
+	lxsspx		vs13, o4, T1		// next b2_i
+	addi		T1, T1, 8
+
+	lxsspx		vs14, o0, T1		// next b3_r
+	lxsspx		vs15, o4, T1		// next b3_i
+
+	addi		BO, BO, 32
+
+	xsmaddasp	vs32, vs4, vs16		// a4_r*b0_r
+	xsmaddasp	vs33, vs5, vs17		// a4_i*b0_i
+	xsmaddasp	vs34, vs4, vs17		// a4_r*b0_i
+	xsmaddasp	vs35, vs5, vs16		// a4_i*b0_r
+
+	xsmaddasp	vs36, vs4, vs18		// a4_r*b1_r
+	xsmaddasp	vs37, vs5, vs19		// a4_i*b1_i
+	xsmaddasp	vs38, vs4, vs19		// a4_r*b1_i
+	xsmaddasp	vs39, vs5, vs18		// a4_i*b1_r
+
+	xsmaddasp	vs40, vs4, vs20		// a4_r*b2_r
+	xsmaddasp	vs41, vs5, vs21		// a4_i*b2_i
+	xsmaddasp	vs42, vs4, vs21		// a4_r*b2_i
+	xsmaddasp	vs43, vs5, vs20		// a4_i*b2_r
+
+	xsmaddasp	vs44, vs4, vs22		// a4_r*b3_r
+	xsmaddasp	vs45, vs5, vs23		// a4_i*b3_i
+	xsmaddasp	vs46, vs4, vs23		// a4_r*b3_i
+	xsmaddasp	vs47, vs5, vs22		// a4_i*b3_r
+
+.endm
+
+/* Pipeline epilogue (scalar): drain last prefetched operands, no loads. */
+.macro KERNEL4x1_E2
+
+	xsmaddasp	vs32, vs4, vs16
+	xsmaddasp	vs33, vs5, vs17
+	xsmaddasp	vs34, vs4, vs17
+	xsmaddasp	vs35, vs5, vs16
+
+	xsmaddasp	vs36, vs4, vs18
+	xsmaddasp	vs37, vs5, vs19
+	xsmaddasp	vs38, vs4, vs19
+	xsmaddasp	vs39, vs5, vs18
+
+	xsmaddasp	vs40, vs4, vs20
+	xsmaddasp	vs41, vs5, vs21
+	xsmaddasp	vs42, vs4, vs21
+	xsmaddasp	vs43, vs5, vs20
+
+	xsmaddasp	vs44, vs4, vs22
+	xsmaddasp	vs45, vs5, vs23
+	xsmaddasp	vs46, vs4, vs23
+	xsmaddasp	vs47, vs5, vs22
+
+.endm
+
+/* Single k-step (scalar), initialising the accumulators. */
+.macro KERNEL4x1_SUBI1
+
+	lxsspx		vs0, o0, AO		// a0_r
+	lxsspx		vs1, o4, AO		// a0_i
+	addi		AO, AO, 8
+
+	mr		T1, BO
+
+	lxsspx		vs8, o0, T1		// b0_r
+	lxsspx		vs9, o4, T1		// b0_i
+	addi		T1, T1, 8
+
+	lxsspx		vs10, o0, T1		// b1_r
+	lxsspx		vs11, o4, T1		// b1_i
+	addi		T1, T1, 8
+
+	lxsspx		vs12, o0, T1		// b2_r
+	lxsspx		vs13, o4, T1		// b2_i
+	addi		T1, T1, 8
+
+	lxsspx		vs14, o0, T1		// b3_r
+	lxsspx		vs15, o4, T1		// b3_i
+
+	addi		BO, BO, 32
+
+	xsmulsp		vs32, vs0, vs8
+	xsmulsp		vs33, vs1, vs9
+	xsmulsp		vs34, vs0, vs9
+	xsmulsp		vs35, vs1, vs8
+
+	xsmulsp		vs36, vs0, vs10
+	xsmulsp		vs37, vs1, vs11
+	xsmulsp		vs38, vs0, vs11
+	xsmulsp		vs39, vs1, vs10
+
+	xsmulsp		vs40, vs0, vs12
+	xsmulsp		vs41, vs1, vs13
+	xsmulsp		vs42, vs0, vs13
+	xsmulsp		vs43, vs1, vs12
+
+	xsmulsp		vs44, vs0, vs14
+	xsmulsp		vs45, vs1, vs15
+	xsmulsp		vs46, vs0, vs15
+	xsmulsp		vs47, vs1, vs14
+
+.endm
+
+/* Single k-step (scalar), accumulating into existing partial sums. */
+.macro KERNEL4x1_SUB1
+
+	lxsspx		vs0, o0, AO		// a0_r
+	lxsspx		vs1, o4, AO		// a0_i
+	addi		AO, AO, 8
+
+	mr		T1, BO
+
+	lxsspx		vs8, o0, T1		// b0_r
+	lxsspx		vs9, o4, T1		// b0_i
+	addi		T1, T1, 8
+
+	lxsspx		vs10, o0, T1		// b1_r
+	lxsspx		vs11, o4, T1		// b1_i
+	addi		T1, T1, 8
+
+	lxsspx		vs12, o0, T1		// b2_r
+	lxsspx		vs13, o4, T1		// b2_i
+	addi		T1, T1, 8
+
+	lxsspx		vs14, o0, T1		// b3_r
+	lxsspx		vs15, o4, T1		// b3_i
+
+	addi		BO, BO, 32
+
+	xsmaddasp	vs32, vs0, vs8
+	xsmaddasp	vs33, vs1, vs9
+	xsmaddasp	vs34, vs0, vs9
+	xsmaddasp	vs35, vs1, vs8
+
+	xsmaddasp	vs36, vs0, vs10
+	xsmaddasp	vs37, vs1, vs11
+	xsmaddasp	vs38, vs0, vs11
+	xsmaddasp	vs39, vs1, vs10
+
+	xsmaddasp	vs40, vs0, vs12
+	xsmaddasp	vs41, vs1, vs13
+	xsmaddasp	vs42, vs0, vs13
+	xsmaddasp	vs43, vs1, vs12
+
+	xsmaddasp	vs44, vs0, vs14
+	xsmaddasp	vs45, vs1, vs15
+	xsmaddasp	vs46, vs0, vs15
+	xsmaddasp	vs47, vs1, vs14
+
+.endm
+
+/* Write-back for N=4, M=1: per C column, reduce the four scalar partial
+   products via XSFADD_*, scale by (alpha_r, alpha_i) and add into C.
+   Accumulator groups per column: N=0 vs32..35, N=1 vs36..39,
+   N=2 vs40..43, N=3 vs44..47. */
+.macro SAVE4x1
+
+	mr		T1, CO
+
+// N=0
+
+	mr		T2, T1
+
+// N=0 M=0
+
+	xxlxor		vs4, vs4, vs4
+	xxlxor		vs5, vs5, vs5
+
+#ifndef TRMMKERNEL
+	lxsspx		vs0, o0, T2		// c0_r
+	lxsspx		vs1, o4, T2		// c0_i
+#else
+	xxlxor		vs0, vs0, vs0
+	xxlxor		vs1, vs1, vs1
+#endif
+
+	XSFADD_R1	vs4, vs4, vs32		// add a0_r * b0_r
+	XSFADD_I1	vs5, vs5, vs35		// add a0_i * b0_r
+
+	XSFADD_R2	vs4, vs4, vs33		// add a0_i * b0_i
+	XSFADD_I2	vs5, vs5, vs34		// add a0_r * b0_i
+
+	xsmulsp		vs16, vs4, alpha_r
+	xsmulsp		vs17, vs5, alpha_i
+	xsmulsp		vs18, vs4, alpha_i
+	xsmulsp		vs19, vs5, alpha_r
+	xssubsp		vs20, vs16, vs17	// r0_r*alpha_r - r0_i*alpha_i
+	xsaddsp		vs21, vs18, vs19	// r0_r*alpha_i + r0_i*alpha_r
+
+	xsaddsp		vs0, vs0, vs20
+	xsaddsp		vs1, vs1, vs21
+
+	stxsspx		vs0, o0, T2		// c0_r
+	stxsspx		vs1, o4, T2		// c0_i
+
+	addi		T2, T2, 8
+	add		T1, T1, LDC
+
+// N=1
+
+	mr		T2, T1
+
+// N=1 M=0
+
+	xxlxor		vs4, vs4, vs4
+	xxlxor		vs5, vs5, vs5
+
+#ifndef TRMMKERNEL
+	lxsspx		vs0, o0, T2		// c0_r
+	lxsspx		vs1, o4, T2		// c0_i
+#else
+	xxlxor		vs0, vs0, vs0
+	xxlxor		vs1, vs1, vs1
+#endif
+
+	XSFADD_R1	vs4, vs4, vs36
+	XSFADD_I1	vs5, vs5, vs39
+
+	XSFADD_R2	vs4, vs4, vs37
+	XSFADD_I2	vs5, vs5, vs38
+
+	xsmulsp		vs16, vs4, alpha_r
+	xsmulsp		vs17, vs5, alpha_i
+	xsmulsp		vs18, vs4, alpha_i
+	xsmulsp		vs19, vs5, alpha_r
+	xssubsp		vs20, vs16, vs17
+	xsaddsp		vs21, vs18, vs19
+
+	xsaddsp		vs0, vs0, vs20
+	xsaddsp		vs1, vs1, vs21
+
+	stxsspx		vs0, o0, T2		// c0_r
+	stxsspx		vs1, o4, T2		// c0_i
+
+	addi		T2, T2, 8
+	add		T1, T1, LDC
+
+// N=2
+
+	mr		T2, T1
+
+// N=2 M=0
+
+	xxlxor		vs4, vs4, vs4
+	xxlxor		vs5, vs5, vs5
+
+#ifndef TRMMKERNEL
+	lxsspx		vs0, o0, T2		// c0_r
+	lxsspx		vs1, o4, T2		// c0_i
+#else
+	xxlxor		vs0, vs0, vs0
+	xxlxor		vs1, vs1, vs1
+#endif
+
+	XSFADD_R1	vs4, vs4, vs40
+	XSFADD_I1	vs5, vs5, vs43
+
+	XSFADD_R2	vs4, vs4, vs41
+	XSFADD_I2	vs5, vs5, vs42
+
+	xsmulsp		vs16, vs4, alpha_r
+	xsmulsp		vs17, vs5, alpha_i
+	xsmulsp		vs18, vs4, alpha_i
+	xsmulsp		vs19, vs5, alpha_r
+	xssubsp		vs20, vs16, vs17
+	xsaddsp		vs21, vs18, vs19
+
+	xsaddsp		vs0, vs0, vs20
+	xsaddsp		vs1, vs1, vs21
+
+	stxsspx		vs0, o0, T2		// c0_r
+	stxsspx		vs1, o4, T2		// c0_i
+
+	addi		T2, T2, 8
+	add		T1, T1, LDC
+
+// N=3
+
+	mr		T2, T1
+
+// N=3 M=0
+
+	xxlxor		vs4, vs4, vs4
+	xxlxor		vs5, vs5, vs5
+
+#ifndef TRMMKERNEL
+	lxsspx		vs0, o0, T2		// c0_r
+	lxsspx		vs1, o4, T2		// c0_i
+#else
+	xxlxor		vs0, vs0, vs0
+	xxlxor		vs1, vs1, vs1
+#endif
+
+	XSFADD_R1	vs4, vs4, vs44
+	XSFADD_I1	vs5, vs5, vs47
+
+	XSFADD_R2	vs4, vs4, vs45
+	XSFADD_I2	vs5, vs5, vs46
+
+	xsmulsp		vs16, vs4, alpha_r
+	xsmulsp		vs17, vs5, alpha_i
+	xsmulsp		vs18, vs4, alpha_i
+	xsmulsp		vs19, vs5, alpha_r
+	xssubsp		vs20, vs16, vs17
+	xsaddsp		vs21, vs18, vs19
+
+	xsaddsp		vs0, vs0, vs20
+	xsaddsp		vs1, vs1, vs21
+
+	stxsspx		vs0, o0, T2		// c0_r
+	stxsspx		vs1, o4, T2		// c0_i
+
+	addi		T2, T2, 8
+	add		T1, T1, LDC
+
+	addi		CO, CO, 8
+
+.endm
+
+
+/**********************************************************************************************
+* Macros for N=2 and M=8
+**********************************************************************************************/
+
+/* N=2, M=8: load four A vectors (vs0..vs3) and splat the two B value pairs
+   into vs8..vs11 (current operand set). */
+.macro LOAD2x8_1
+
+	lxvw4x		vs0, o0, AO		// a0, a1
+	lxvw4x		vs1, o16, AO		// a2, a3
+	lxvw4x		vs2, o32, AO		// a4, a5
+	lxvw4x		vs3, o48, AO		// a6, a7
+	addi		AO, AO, 64
+
+	lxvw4x		vs24, o0, BO		// b0, b1
+	xxspltw		vs8, vs24, 0
+	xxspltw		vs9, vs24, 1
+	xxspltw		vs10, vs24, 2
+	xxspltw		vs11, vs24, 3
+
+	addi		BO, BO, 16
+
+.endm
+
+/* First pipelined iteration: prefetch next set (vs4..vs7, vs16..vs19) while
+   initialising vs32..vs47 from the current set (multiply, no accumulate). */
+.macro KERNEL2x8_I1
+
+	lxvw4x		vs4, o0, AO		// next a0, a1
+	lxvw4x		vs5, o16, AO		// next a2, a3
+	lxvw4x		vs6, o32, AO		// next a4, a5
+	lxvw4x		vs7, o48, AO		// next a6, a7
+	addi		AO, AO, 64
+
+	lxvw4x		vs24, o0, BO		// next b0, b1
+	xxspltw		vs16, vs24, 0
+	xxspltw		vs17, vs24, 1
+	xxspltw		vs18, vs24, 2
+	xxspltw		vs19, vs24, 3
+
+	addi		BO, BO, 16
+
+	// vs32..vs39 = a0..a3 (vs0..vs3) * b0 lane pair (vs8, vs9)
+	xvmulsp		vs32, vs0, vs8
+	xvmulsp		vs33, vs0, vs9
+	xvmulsp		vs34, vs1, vs8
+	xvmulsp		vs35, vs1, vs9
+	xvmulsp		vs36, vs2, vs8
+	xvmulsp		vs37, vs2, vs9
+	xvmulsp		vs38, vs3, vs8
+	xvmulsp		vs39, vs3, vs9
+
+	// vs40..vs47 = a0..a3 (vs0..vs3) * b1 lane pair (vs10, vs11)
+	xvmulsp		vs40, vs0, vs10
+	xvmulsp		vs41, vs0, vs11
+	xvmulsp		vs42, vs1, vs10
+	xvmulsp		vs43, vs1, vs11
+	xvmulsp		vs44, vs2, vs10
+	xvmulsp		vs45, vs2, vs11
+	xvmulsp		vs46, vs3, vs10
+	xvmulsp		vs47, vs3, vs11
+
+.endm
+
+/* Even pipelined iteration: prefetch next set, accumulate with current set. */
+.macro KERNEL2x8_1
+
+	lxvw4x		vs4, o0, AO		// next a0, a1
+	lxvw4x		vs5, o16, AO		// next a2, a3
+	lxvw4x		vs6, o32, AO		// next a4, a5
+	lxvw4x		vs7, o48, AO		// next a6, a7
+	addi		AO, AO, 64
+
+	lxvw4x		vs24, o0, BO		// next b0, b1
+	xxspltw		vs16, vs24, 0
+	xxspltw		vs17, vs24, 1
+	xxspltw		vs18, vs24, 2
+	xxspltw		vs19, vs24, 3
+
+	addi		BO, BO, 16
+
+	xvmaddasp	vs32, vs0, vs8
+	xvmaddasp	vs33, vs0, vs9
+	xvmaddasp	vs34, vs1, vs8
+	xvmaddasp	vs35, vs1, vs9
+	xvmaddasp	vs36, vs2, vs8
+	xvmaddasp	vs37, vs2, vs9
+	xvmaddasp	vs38, vs3, vs8
+	xvmaddasp	vs39, vs3, vs9
+
+	xvmaddasp	vs40, vs0, vs10
+	xvmaddasp	vs41, vs0, vs11
+	xvmaddasp	vs42, vs1, vs10
+	xvmaddasp	vs43, vs1, vs11
+	xvmaddasp	vs44, vs2, vs10
+	xvmaddasp	vs45, vs2, vs11
+	xvmaddasp	vs46, vs3, vs10
+	xvmaddasp	vs47, vs3, vs11
+
+.endm
+
+/* Odd pipelined iteration (N=2, M=8): prefetch into the current set
+   (vs0..vs3, vs8..vs11), accumulate with the prefetched set
+   (vs4..vs7, vs16..vs19). */
+.macro KERNEL2x8_2
+
+	lxvw4x		vs0, o0, AO		// next a0, a1
+	lxvw4x		vs1, o16, AO		// next a2, a3
+	lxvw4x		vs2, o32, AO		// next a4, a5
+	lxvw4x		vs3, o48, AO		// next a6, a7
+	addi		AO, AO, 64
+
+	lxvw4x		vs24, o0, BO		// next b0, b1
+	xxspltw		vs8, vs24, 0
+	xxspltw		vs9, vs24, 1
+	xxspltw		vs10, vs24, 2
+	xxspltw		vs11, vs24, 3
+
+	addi		BO, BO, 16
+
+	// vs32..vs39 += a4..a7 (vs4..vs7) * b0 lane pair (vs16, vs17)
+	xvmaddasp	vs32, vs4, vs16
+	xvmaddasp	vs33, vs4, vs17
+	xvmaddasp	vs34, vs5, vs16
+	xvmaddasp	vs35, vs5, vs17
+	xvmaddasp	vs36, vs6, vs16
+	xvmaddasp	vs37, vs6, vs17
+	xvmaddasp	vs38, vs7, vs16
+	xvmaddasp	vs39, vs7, vs17
+
+	// vs40..vs47 += a4..a7 (vs4..vs7) * b1 lane pair (vs18, vs19)
+	xvmaddasp	vs40, vs4, vs18
+	xvmaddasp	vs41, vs4, vs19
+	xvmaddasp	vs42, vs5, vs18
+	xvmaddasp	vs43, vs5, vs19
+	xvmaddasp	vs44, vs6, vs18
+	xvmaddasp	vs45, vs6, vs19
+	xvmaddasp	vs46, vs7, vs18
+	xvmaddasp	vs47, vs7, vs19
+
+.endm
+
+/* Pipeline epilogue: drain the last prefetched operand set, no loads. */
+.macro KERNEL2x8_E2
+
+	xvmaddasp	vs32, vs4, vs16
+	xvmaddasp	vs33, vs4, vs17
+	xvmaddasp	vs34, vs5, vs16
+	xvmaddasp	vs35, vs5, vs17
+	xvmaddasp	vs36, vs6, vs16
+	xvmaddasp	vs37, vs6, vs17
+	xvmaddasp	vs38, vs7, vs16
+	xvmaddasp	vs39, vs7, vs17
+
+	xvmaddasp	vs40, vs4, vs18
+	xvmaddasp	vs41, vs4, vs19
+	xvmaddasp	vs42, vs5, vs18
+	xvmaddasp	vs43, vs5, vs19
+	xvmaddasp	vs44, vs6, vs18
+	xvmaddasp	vs45, vs6, vs19
+	xvmaddasp	vs46, vs7, vs18
+	xvmaddasp	vs47, vs7, vs19
+
+.endm
+
+/* Single (non-pipelined) k-step that INITIALISES the accumulators. */
+.macro KERNEL2x8_SUBI1
+
+	lxvw4x		vs0, o0, AO		// a0, a1
+	lxvw4x		vs1, o16, AO		// a2, a3
+	lxvw4x		vs2, o32, AO		// a4, a5
+	lxvw4x		vs3, o48, AO		// a6, a7
+	addi		AO, AO, 64
+
+	lxvw4x		vs24, o0, BO		// b0, b1
+	xxspltw		vs8, vs24, 0
+	xxspltw		vs9, vs24, 1
+	xxspltw		vs10, vs24, 2
+	xxspltw		vs11, vs24, 3
+
+	addi		BO, BO, 16
+
+	xvmulsp		vs32, vs0, vs8
+	xvmulsp		vs33, vs0, vs9
+	xvmulsp		vs34, vs1, vs8
+	xvmulsp		vs35, vs1, vs9
+	xvmulsp		vs36, vs2, vs8
+	xvmulsp		vs37, vs2, vs9
+	xvmulsp		vs38, vs3, vs8
+	xvmulsp		vs39, vs3, vs9
+
+	xvmulsp		vs40, vs0, vs10
+	xvmulsp		vs41, vs0, vs11
+	xvmulsp		vs42, vs1, vs10
+	xvmulsp		vs43, vs1, vs11
+	xvmulsp		vs44, vs2, vs10
+	xvmulsp		vs45, vs2, vs11
+	xvmulsp		vs46, vs3, vs10
+	xvmulsp		vs47, vs3, vs11
+
+.endm
+
+/* Single (non-pipelined) k-step that ACCUMULATES into existing partial sums. */
+.macro KERNEL2x8_SUB1
+
+	lxvw4x		vs0, o0, AO		// a0, a1
+	lxvw4x		vs1, o16, AO		// a2, a3
+	lxvw4x		vs2, o32, AO		// a4, a5
+	lxvw4x		vs3, o48, AO		// a6, a7
+	addi		AO, AO, 64
+
+	lxvw4x		vs24, o0, BO		// b0, b1
+	xxspltw		vs8, vs24, 0
+	xxspltw		vs9, vs24, 1
+	xxspltw		vs10, vs24, 2
+	xxspltw		vs11, vs24, 3
+
+	addi		BO, BO, 16
+
+	xvmaddasp	vs32, vs0, vs8
+	xvmaddasp	vs33, vs0, vs9
+	xvmaddasp	vs34, vs1, vs8
+	xvmaddasp	vs35, vs1, vs9
+	xvmaddasp	vs36, vs2, vs8
+	xvmaddasp	vs37, vs2, vs9
+	xvmaddasp	vs38, vs3, vs8
+	xvmaddasp	vs39, vs3, vs9
+
+	xvmaddasp	vs40, vs0, vs10
+	xvmaddasp	vs41, vs0, vs11
+	xvmaddasp	vs42, vs1, vs10
+	xvmaddasp	vs43, vs1, vs11
+	xvmaddasp	vs44, vs2, vs10
+	xvmaddasp	vs45, vs2, vs11
+	xvmaddasp	vs46, vs3, vs10
+	xvmaddasp	vs47, vs3, vs11
+
+.endm
+
+/* Write-back for N=2, M=8 — NOTE(review): this macro continues beyond the end
+   of the visible chunk; only the visible head is reproduced below, ending
+   exactly at the cut so the following chunk continues it. */
+.macro SAVE2x8
+
+	mr		T1, CO
+
+// N=0
+
+	mr		T2, T1
+
+// N=0 M=0
+
+	xxlxor		vs4, vs4, vs4
+	xxlxor		vs5, vs5, vs5
+	xxlxor		vs6, vs6, vs6
+	xxlxor		vs7, vs7, vs7
+
+#ifndef TRMMKERNEL
+	lxvw4x		vs0, o0, T2		// c0, c1
+#else
+	xxlxor		vs0, vs0, vs0
+#endif
+
+	stxvw4x		vs32, o0, TBUFFER
+	lxsspx		vs8, o0, TBUFFER
+	lxsspx		vs9, o4, TBUFFER
+	lxsspx		vs10, o8, TBUFFER
+	lxsspx		vs11, o12, TBUFFER
+
+	stxvw4x		vs33, o0, TBUFFER
+	lxsspx		vs12, o0, TBUFFER
+	lxsspx		vs13, o4, TBUFFER
+	lxsspx		vs14, o8, TBUFFER
+	lxsspx		vs15, o12, TBUFFER
+
+	XSFADD_R1	vs4, vs4, vs8
+	XSFADD_I2	vs5, vs5, vs12
+	XSFADD_R1	vs6, vs6, vs10
+	XSFADD_I2	vs7, vs7, vs14
+
+	XSFADD_R2	vs4, vs4, vs13
+	XSFADD_I1	vs5, vs5, vs9
+	XSFADD_R2	vs6, vs6, vs15
+	XSFADD_I1	vs7, vs7, vs11
+
+	xsmulsp		vs16, vs4, alpha_r
+	xsmulsp		vs17, vs5, alpha_i
+	xsmulsp		vs18, vs4, alpha_i
+	xsmulsp		vs19, vs5, alpha_r
+	xssubsp		vs20, vs16, vs17
+	xsaddsp		vs21, vs18, vs19
+
+	xsmulsp		vs16, vs6, alpha_r
+	xsmulsp		vs17, vs7, alpha_i
+	xsmulsp		vs18, vs6, alpha_i
+	xsmulsp		vs19, vs7, alpha_r
+	xssubsp		vs22, vs16, vs17
+	xsaddsp		vs23, vs18, vs19
+
+	stxsspx		vs20, o0, TBUFFER
+	stxsspx		vs21, o4, TBUFFER
+	stxsspx		vs22, o8, TBUFFER
+	stxsspx		vs23, o12, TBUFFER
+	lxvw4x		vs1, o0, TBUFFER
+	xvaddsp		vs0, vs0, vs1
+
+	stxvw4x		vs0, o0, T2		// c0, c1
+
+	addi		T2, T2, 16
+
+// N=0 M=2
+
+	xxlxor		vs4, vs4, vs4
+	xxlxor		vs5, vs5, vs5
+	xxlxor		vs6, vs6, vs6
+	xxlxor		vs7, vs7, vs7
+
+#ifndef TRMMKERNEL
+	lxvw4x		vs0, o0, T2		// c0, c1
+#else
+	xxlxor		vs0, vs0, vs0
+#endif
+
+	stxvw4x		vs34, o0, TBUFFER
+	lxsspx		vs8, o0, TBUFFER
+	lxsspx		vs9, o4, TBUFFER
+	lxsspx		vs10, o8, TBUFFER
+	lxsspx		vs11, o12, TBUFFER
+
+	stxvw4x		vs35, o0, TBUFFER
+	lxsspx		vs12, o0, TBUFFER
+	lxsspx		vs13, o4, TBUFFER
+	lxsspx		vs14, o8, TBUFFER
+	lxsspx		vs15, o12, TBUFFER
+
+	XSFADD_R1	vs4, vs4, vs8
+	XSFADD_I2	vs5, vs5, vs12
+	XSFADD_R1	vs6, vs6, vs10
+	XSFADD_I2	vs7, vs7, vs14
+
+	XSFADD_R2	vs4, vs4, vs13
+	XSFADD_I1	vs5, vs5, vs9
+	XSFADD_R2	vs6, vs6, vs15
+	XSFADD_I1	vs7, vs7, vs11
+
+	xsmulsp		vs16, vs4, alpha_r
+	xsmulsp		vs17, vs5, alpha_i
+	xsmulsp		vs18, vs4, alpha_i
+	xsmulsp		vs19, vs5, alpha_r
+	xssubsp		vs20, vs16, vs17
+	xsaddsp		vs21, vs18, vs19
+
+	xsmulsp		vs16, vs6, alpha_r
+	xsmulsp		vs17, vs7, alpha_i
+	xsmulsp		vs18, vs6, alpha_i
+	xsmulsp		vs19, vs7, alpha_r
+	xssubsp		vs22, vs16, vs17
+	xsaddsp		vs23, vs18, vs19
+
+	stxsspx		vs20, o0, TBUFFER
+	stxsspx		vs21, o4, TBUFFER
+	stxsspx		vs22, o8, TBUFFER
+	stxsspx		vs23, o12, TBUFFER
+	lxvw4x		vs1, o0, TBUFFER
+	xvaddsp		vs0, vs0, vs1
+
+	stxvw4x		vs0, o0, T2		// c0, c1
+
+	addi		T2, T2, 16
+
+// N=0 M=4
+
+	xxlxor		vs4, vs4, vs4
+	xxlxor		vs5, vs5, vs5
+	xxlxor		vs6, vs6, vs6
+	xxlxor		vs7, vs7, vs7
+
+#ifndef TRMMKERNEL
+	lxvw4x		vs0, o0, T2		// c0, c1
+#else
+	xxlxor		vs0, vs0, vs0
+#endif
+
+	stxvw4x		vs36, o0, TBUFFER
+	lxsspx		vs8, o0, TBUFFER
+	lxsspx		vs9, o4, TBUFFER
+	lxsspx		vs10, o8, TBUFFER
+	lxsspx		vs11, o12, TBUFFER
+
+	stxvw4x		vs37, o0, TBUFFER
+	lxsspx		vs12, o0, TBUFFER
+	lxsspx		vs13, o4, TBUFFER
+	lxsspx		vs14, o8, TBUFFER
+	lxsspx		vs15, o12, TBUFFER
+
+	XSFADD_R1	vs4, vs4,
vs8 // add a0_r * b0_r + XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r + xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i + xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i + xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + + xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r + xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i + xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i + xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + + xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + stxsspx vs20, o0, TBUFFER // store r0_r + stxsspx vs21, o4, TBUFFER // store r0_i + stxsspx vs22, o8, TBUFFER // store r1_r + stxsspx vs23, o12, TBUFFER // store r1_i + lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=0 M=6 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + stxvw4x vs38, o0, TBUFFER + + lxsspx vs8, o0, TBUFFER + lxsspx vs9, o4, TBUFFER + lxsspx vs10, o8, TBUFFER + lxsspx vs11, o12, TBUFFER + + stxvw4x vs39, o0, TBUFFER + + lxsspx vs12, o0, TBUFFER + lxsspx vs13, o4, TBUFFER + lxsspx vs14, o8, TBUFFER + lxsspx vs15, o12, TBUFFER + + XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + 
XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r + xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i + xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i + xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + + xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r + xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i + xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i + xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + + xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + stxsspx vs20, o0, TBUFFER // store r0_r + stxsspx vs21, o4, TBUFFER // store r0_i + stxsspx vs22, o8, TBUFFER // store r1_r + stxsspx vs23, o12, TBUFFER // store r1_i + lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + +// N=1 + + mr T2, T1 + +// N=1 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + stxvw4x vs40, o0, TBUFFER + + lxsspx vs8, o0, TBUFFER + lxsspx vs9, o4, TBUFFER + lxsspx vs10, o8, TBUFFER + lxsspx vs11, o12, TBUFFER + + stxvw4x vs41, o0, TBUFFER + + lxsspx vs12, o0, TBUFFER + lxsspx vs13, o4, TBUFFER + lxsspx vs14, o8, TBUFFER + lxsspx vs15, o12, TBUFFER + + XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xsmulsp vs16, vs4, alpha_r 
// r0_r * alpha_r + xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i + xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i + xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + + xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r + xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i + xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i + xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + + xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + stxsspx vs20, o0, TBUFFER // store r0_r + stxsspx vs21, o4, TBUFFER // store r0_i + stxsspx vs22, o8, TBUFFER // store r1_r + stxsspx vs23, o12, TBUFFER // store r1_i + lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=1 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + stxvw4x vs42, o0, TBUFFER + + lxsspx vs8, o0, TBUFFER + lxsspx vs9, o4, TBUFFER + lxsspx vs10, o8, TBUFFER + lxsspx vs11, o12, TBUFFER + + stxvw4x vs43, o0, TBUFFER + + lxsspx vs12, o0, TBUFFER + lxsspx vs13, o4, TBUFFER + lxsspx vs14, o8, TBUFFER + lxsspx vs15, o12, TBUFFER + + XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r + xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i + xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i + xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + + xssubsp vs20, vs16, vs17 // r0_r * alpha_r - 
r0_i * alpha_i + xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r + xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i + xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i + xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + + xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + stxsspx vs20, o0, TBUFFER // store r0_r + stxsspx vs21, o4, TBUFFER // store r0_i + stxsspx vs22, o8, TBUFFER // store r1_r + stxsspx vs23, o12, TBUFFER // store r1_i + lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=1 M=4 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + stxvw4x vs44, o0, TBUFFER + + lxsspx vs8, o0, TBUFFER + lxsspx vs9, o4, TBUFFER + lxsspx vs10, o8, TBUFFER + lxsspx vs11, o12, TBUFFER + + stxvw4x vs45, o0, TBUFFER + + lxsspx vs12, o0, TBUFFER + lxsspx vs13, o4, TBUFFER + lxsspx vs14, o8, TBUFFER + lxsspx vs15, o12, TBUFFER + + XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r + xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i + xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i + xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + + xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r + xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i + xsmulsp vs18, vs6, alpha_i // r1_r 
* alpha_i + xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + + xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + stxsspx vs20, o0, TBUFFER // store r0_r + stxsspx vs21, o4, TBUFFER // store r0_i + stxsspx vs22, o8, TBUFFER // store r1_r + stxsspx vs23, o12, TBUFFER // store r1_i + lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=1 M=6 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + stxvw4x vs46, o0, TBUFFER + + lxsspx vs8, o0, TBUFFER + lxsspx vs9, o4, TBUFFER + lxsspx vs10, o8, TBUFFER + lxsspx vs11, o12, TBUFFER + + stxvw4x vs47, o0, TBUFFER + + lxsspx vs12, o0, TBUFFER + lxsspx vs13, o4, TBUFFER + lxsspx vs14, o8, TBUFFER + lxsspx vs15, o12, TBUFFER + + XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r + xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i + xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i + xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + + xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r + xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i + xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i + xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + + xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + stxsspx vs20, o0, 
TBUFFER // store r0_r + stxsspx vs21, o4, TBUFFER // store r0_i + stxsspx vs22, o8, TBUFFER // store r1_r + stxsspx vs23, o12, TBUFFER // store r1_i + lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + addi CO, CO, 64 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=4 +**********************************************************************************************/ + +.macro LOAD2x4_1 + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + + addi AO, AO, 32 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 16 + +.endm + +.macro KERNEL2x4_I1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + lxvw4x vs5, o16, AO // load a2, a3 + + + addi AO, AO, 32 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs16, vs24, 0 + xxspltw vs17, vs24, 1 + xxspltw vs18, vs24, 2 + xxspltw vs19, vs24, 3 + + + addi BO, BO, 16 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmulsp vs36, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs37, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs38, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x4_1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + lxvw4x vs5, o16, AO // load a2, a3 + + + addi AO, AO, 32 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs16, vs24, 0 + xxspltw vs17, vs24, 1 + xxspltw vs18, vs24, 2 + xxspltw vs19, vs24, 3 + + + addi BO, BO, 16 + + 
+ xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs36, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs37, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs38, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x4_2 + + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + + addi AO, AO, 32 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 16 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs36, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs37, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs38, vs5, vs18 // a5_r*b1_r, a5_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs39, vs5, vs19 // a5_r*b1_i, a5_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x4_E2 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs36, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs37, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs38, vs5, vs18 // 
a5_r*b1_r, a5_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs39, vs5, vs19 // a5_r*b1_i, a5_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x4_SUBI1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + + addi AO, AO, 32 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 16 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmulsp vs36, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs37, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs38, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x4_SUB1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + + addi AO, AO, 32 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 16 + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs36, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs37, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs38, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro SAVE2x4 + + mr T1, CO + +// N=0 + + mr T2, T1 + +// N=0 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, 
vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + stxvw4x vs32, o0, TBUFFER + + lxsspx vs8, o0, TBUFFER + lxsspx vs9, o4, TBUFFER + lxsspx vs10, o8, TBUFFER + lxsspx vs11, o12, TBUFFER + + stxvw4x vs33, o0, TBUFFER + + lxsspx vs12, o0, TBUFFER + lxsspx vs13, o4, TBUFFER + lxsspx vs14, o8, TBUFFER + lxsspx vs15, o12, TBUFFER + + XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r + xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i + xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i + xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + + xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r + xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i + xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i + xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + + xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + stxsspx vs20, o0, TBUFFER // store r0_r + stxsspx vs21, o4, TBUFFER // store r0_i + stxsspx vs22, o8, TBUFFER // store r1_r + stxsspx vs23, o12, TBUFFER // store r1_i + lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=0 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + stxvw4x vs34, o0, TBUFFER + + lxsspx vs8, o0, TBUFFER + 
lxsspx vs9, o4, TBUFFER + lxsspx vs10, o8, TBUFFER + lxsspx vs11, o12, TBUFFER + + stxvw4x vs35, o0, TBUFFER + + lxsspx vs12, o0, TBUFFER + lxsspx vs13, o4, TBUFFER + lxsspx vs14, o8, TBUFFER + lxsspx vs15, o12, TBUFFER + + XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r + xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i + xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i + xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + + xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r + xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i + xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i + xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + + xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + stxsspx vs20, o0, TBUFFER // store r0_r + stxsspx vs21, o4, TBUFFER // store r0_i + stxsspx vs22, o8, TBUFFER // store r1_r + stxsspx vs23, o12, TBUFFER // store r1_i + lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + +// N=1 + + mr T2, T1 + +// N=1 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + stxvw4x vs36, o0, TBUFFER + + lxsspx vs8, o0, TBUFFER + lxsspx vs9, o4, TBUFFER + lxsspx vs10, o8, TBUFFER + lxsspx vs11, o12, TBUFFER + + stxvw4x vs37, o0, TBUFFER + + lxsspx vs12, o0, TBUFFER + lxsspx vs13, o4, TBUFFER + 
lxsspx vs14, o8, TBUFFER + lxsspx vs15, o12, TBUFFER + + XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r + xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i + xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i + xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + + xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r + xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i + xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i + xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + + xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + stxsspx vs20, o0, TBUFFER // store r0_r + stxsspx vs21, o4, TBUFFER // store r0_i + stxsspx vs22, o8, TBUFFER // store r1_r + stxsspx vs23, o12, TBUFFER // store r1_i + lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=1 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + stxvw4x vs38, o0, TBUFFER + + lxsspx vs8, o0, TBUFFER + lxsspx vs9, o4, TBUFFER + lxsspx vs10, o8, TBUFFER + lxsspx vs11, o12, TBUFFER + + stxvw4x vs39, o0, TBUFFER + + lxsspx vs12, o0, TBUFFER + lxsspx vs13, o4, TBUFFER + lxsspx vs14, o8, TBUFFER + lxsspx vs15, o12, TBUFFER + + XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XSFADD_I2 vs7, 
vs7, vs14 // add a1_r * b0_i + + XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r + xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i + xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i + xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + + xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r + xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i + xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i + xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + + xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + stxsspx vs20, o0, TBUFFER // store r0_r + stxsspx vs21, o4, TBUFFER // store r0_i + stxsspx vs22, o8, TBUFFER // store r1_r + stxsspx vs23, o12, TBUFFER // store r1_i + lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + addi CO, CO, 32 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=2 +**********************************************************************************************/ + +.macro LOAD2x2_1 + + lxvw4x vs0, o0, AO // load a0, a1 + + + addi AO, AO, 16 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 16 + +.endm + +.macro KERNEL2x2_I1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + + addi AO, AO, 16 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs16, vs24, 0 + xxspltw vs17, vs24, 1 + xxspltw vs18, vs24, 2 + xxspltw vs19, vs24, 3 + + + addi BO, BO, 16 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp 
vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmulsp vs34, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x2_1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + + addi AO, AO, 16 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs16, vs24, 0 + xxspltw vs17, vs24, 1 + xxspltw vs18, vs24, 2 + xxspltw vs19, vs24, 3 + + + addi BO, BO, 16 + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs34, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x2_2 + + + lxvw4x vs0, o0, AO // load a0, a1 + + + addi AO, AO, 16 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 16 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs34, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs35, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x2_E2 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs34, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs35, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x2_SUBI1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + + addi AO, AO, 16 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 16 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, 
vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmulsp vs34, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x2_SUB1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + + addi AO, AO, 16 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 16 + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs34, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro SAVE2x2 + + mr T1, CO + +// N=0 + + mr T2, T1 + +// N=0 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + stxvw4x vs32, o0, TBUFFER + + lxsspx vs8, o0, TBUFFER + lxsspx vs9, o4, TBUFFER + lxsspx vs10, o8, TBUFFER + lxsspx vs11, o12, TBUFFER + + stxvw4x vs33, o0, TBUFFER + + lxsspx vs12, o0, TBUFFER + lxsspx vs13, o4, TBUFFER + lxsspx vs14, o8, TBUFFER + lxsspx vs15, o12, TBUFFER + + XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r + xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i + xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i + xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + + xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsmulsp 
vs16, vs6, alpha_r // r1_r * alpha_r + xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i + xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i + xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + + xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + stxsspx vs20, o0, TBUFFER // store r0_r + stxsspx vs21, o4, TBUFFER // store r0_i + stxsspx vs22, o8, TBUFFER // store r1_r + stxsspx vs23, o12, TBUFFER // store r1_i + lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + +// N=1 + + mr T2, T1 + +// N=1 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + stxvw4x vs34, o0, TBUFFER + + lxsspx vs8, o0, TBUFFER + lxsspx vs9, o4, TBUFFER + lxsspx vs10, o8, TBUFFER + lxsspx vs11, o12, TBUFFER + + stxvw4x vs35, o0, TBUFFER + + lxsspx vs12, o0, TBUFFER + lxsspx vs13, o4, TBUFFER + lxsspx vs14, o8, TBUFFER + lxsspx vs15, o12, TBUFFER + + XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r + xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i + xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i + xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + + xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r + xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i + xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i + xsmulsp vs19, vs7, alpha_r // 
r1_i * alpha_r + + xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + stxsspx vs20, o0, TBUFFER // store r0_r + stxsspx vs21, o4, TBUFFER // store r0_i + stxsspx vs22, o8, TBUFFER // store r1_r + stxsspx vs23, o12, TBUFFER // store r1_i + lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + addi CO, CO, 16 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=1 +**********************************************************************************************/ + +.macro LOAD2x1_1 + + lxsspx vs0, o0, AO // load a0_r + lxsspx vs1, o4, AO // load a0_i + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 // load b0_r + lxsspx vs9, o4, T1 // load b0_i + + addi T1, T1,8 + + lxsspx vs10, o0, T1 // load b1_r + lxsspx vs11, o4, T1 // load b1_i + + addi BO, BO, 16 + +.endm + +.macro KERNEL2x1_I1 + + + lxsspx vs4, o0, AO // load a0_r + lxsspx vs5, o4, AO // load a0_i + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs16, o0, T1 // load b0_r + lxsspx vs17, o4, T1 // load b0_i + + addi T1, T1,8 + + lxsspx vs18, o0, T1 // load b1_r + lxsspx vs19, o4, T1 // load b1_i + + addi BO, BO, 16 + + + xsmulsp vs32, vs0, vs8 // a0_r*b0_r + xsmulsp vs33, vs1, vs9 // a0_i*b0_i + xsmulsp vs34, vs0, vs9 // a0_r*b0_i + xsmulsp vs35, vs1, vs8 // a0_i*b0_r + + xsmulsp vs36, vs0, vs10 // a0_r*b1_r + xsmulsp vs37, vs1, vs11 // a0_i*b1_i + xsmulsp vs38, vs0, vs11 // a0_r*b1_i + xsmulsp vs39, vs1, vs10 // a0_i*b1_r + + +.endm + +.macro KERNEL2x1_1 + + + lxsspx vs4, o0, AO // load a0_r + lxsspx vs5, o4, AO // load a0_i + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs16, o0, T1 // load b0_r + lxsspx vs17, o4, T1 // load b0_i + + addi T1, T1,8 + + lxsspx vs18, o0, T1 // load b1_r + lxsspx vs19, o4, T1 // load b1_i + + addi BO, BO, 16 + + + xsmaddasp vs32, 
vs0, vs8 // a0_r*b0_r + xsmaddasp vs33, vs1, vs9 // a0_i*b0_i + xsmaddasp vs34, vs0, vs9 // a0_r*b0_i + xsmaddasp vs35, vs1, vs8 // a0_i*b0_r + + xsmaddasp vs36, vs0, vs10 // a0_r*b1_r + xsmaddasp vs37, vs1, vs11 // a0_i*b1_i + xsmaddasp vs38, vs0, vs11 // a0_r*b1_i + xsmaddasp vs39, vs1, vs10 // a0_i*b1_r + + +.endm + +.macro KERNEL2x1_2 + + + lxsspx vs0, o0, AO // load a0_r + lxsspx vs1, o4, AO // load a0_i + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 // load b0_r + lxsspx vs9, o4, T1 // load b0_i + + addi T1, T1,8 + + lxsspx vs10, o0, T1 // load b1_r + lxsspx vs11, o4, T1 // load b1_i + + addi BO, BO, 16 + + + xsmaddasp vs32, vs4, vs16 // a4_r*b0_r + xsmaddasp vs33, vs5, vs17 // a4_i*b0_i + xsmaddasp vs34, vs4, vs17 // a4_r*b0_i + xsmaddasp vs35, vs5, vs16 // a4_i*b0_r + + xsmaddasp vs36, vs4, vs18 // a4_r*b1_r + xsmaddasp vs37, vs5, vs19 // a4_i*b1_i + xsmaddasp vs38, vs4, vs19 // a4_r*b1_i + xsmaddasp vs39, vs5, vs18 // a4_i*b1_r + + +.endm + +.macro KERNEL2x1_E2 + + + xsmaddasp vs32, vs4, vs16 // a4_r*b0_r + xsmaddasp vs33, vs5, vs17 // a4_i*b0_i + xsmaddasp vs34, vs4, vs17 // a4_r*b0_i + xsmaddasp vs35, vs5, vs16 // a4_i*b0_r + + xsmaddasp vs36, vs4, vs18 // a4_r*b1_r + xsmaddasp vs37, vs5, vs19 // a4_i*b1_i + xsmaddasp vs38, vs4, vs19 // a4_r*b1_i + xsmaddasp vs39, vs5, vs18 // a4_i*b1_r + + +.endm + +.macro KERNEL2x1_SUBI1 + + + lxsspx vs0, o0, AO // load a0_r + lxsspx vs1, o4, AO // load a0_i + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 // load b0_r + lxsspx vs9, o4, T1 // load b0_i + + addi T1, T1,8 + + lxsspx vs10, o0, T1 // load b1_r + lxsspx vs11, o4, T1 // load b1_i + + addi BO, BO, 16 + + + xsmulsp vs32, vs0, vs8 // a0_r*b0_r + xsmulsp vs33, vs1, vs9 // a0_i*b0_i + xsmulsp vs34, vs0, vs9 // a0_r*b0_i + xsmulsp vs35, vs1, vs8 // a0_i*b0_r + + xsmulsp vs36, vs0, vs10 // a0_r*b1_r + xsmulsp vs37, vs1, vs11 // a0_i*b1_i + xsmulsp vs38, vs0, vs11 // a0_r*b1_i + xsmulsp vs39, vs1, vs10 // a0_i*b1_r + + +.endm + +.macro KERNEL2x1_SUB1 
+ + + lxsspx vs0, o0, AO // load a0_r + lxsspx vs1, o4, AO // load a0_i + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 // load b0_r + lxsspx vs9, o4, T1 // load b0_i + + addi T1, T1,8 + + lxsspx vs10, o0, T1 // load b1_r + lxsspx vs11, o4, T1 // load b1_i + + addi BO, BO, 16 + + + xsmaddasp vs32, vs0, vs8 // a0_r*b0_r + xsmaddasp vs33, vs1, vs9 // a0_i*b0_i + xsmaddasp vs34, vs0, vs9 // a0_r*b0_i + xsmaddasp vs35, vs1, vs8 // a0_i*b0_r + + xsmaddasp vs36, vs0, vs10 // a0_r*b1_r + xsmaddasp vs37, vs1, vs11 // a0_i*b1_i + xsmaddasp vs38, vs0, vs11 // a0_r*b1_i + xsmaddasp vs39, vs1, vs10 // a0_i*b1_r + + +.endm + +.macro SAVE2x1 + + mr T1, CO + +// N=0 + + mr T2, T1 + +// N=0 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + +#ifndef TRMMKERNEL + lxsspx vs0, o0, T2 // load c0_r + lxsspx vs1, o4, T2 // load c0_i +#else + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 +#endif + + XSFADD_R1 vs4, vs4, vs32 // add a0_r * b0_r + XSFADD_I1 vs5, vs5, vs35 // add a0_r * b0_i + + XSFADD_R2 vs4, vs4, vs33 // add a0_i * b0_i + XSFADD_I2 vs5, vs5, vs34 // add a0_i * b0_r + + xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r + xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i + xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i + xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + + xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsaddsp vs0, vs0, vs20 + xsaddsp vs1, vs1, vs21 + + + stxsspx vs0, o0, T2 // store c0_r + stxsspx vs1, o4, T2 // store c0_i + + addi T2, T2, 8 + add T1, T1, LDC + + +// N=1 + + mr T2, T1 + +// N=1 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + +#ifndef TRMMKERNEL + lxsspx vs0, o0, T2 // load c0_r + lxsspx vs1, o4, T2 // load c0_i +#else + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 +#endif + + XSFADD_R1 vs4, vs4, vs36 // add a0_r * b0_r + XSFADD_I1 vs5, vs5, vs39 // add a0_r * b0_i + + XSFADD_R2 vs4, vs4, vs37 // add a0_i * b0_i + XSFADD_I2 vs5, vs5, vs38 // add a0_i * b0_r + + 
xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r + xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i + xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i + xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + + xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsaddsp vs0, vs0, vs20 + xsaddsp vs1, vs1, vs21 + + + stxsspx vs0, o0, T2 // store c0_r + stxsspx vs1, o4, T2 // store c0_i + + addi T2, T2, 8 + add T1, T1, LDC + + addi CO, CO, 8 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=8 +**********************************************************************************************/ + +.macro LOAD1x8_1 + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + lxvw4x vs2, o32, AO // load a4, a5 + + lxvw4x vs3, o48, AO // load a6, a7 + + + addi AO, AO, 64 + + lxvw4x vs24, o0, BO // load b0, b1 + + + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 8 + +.endm + +.macro KERNEL1x8_I1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + lxvw4x vs5, o16, AO // load a2, a3 + + lxvw4x vs6, o32, AO // load a4, a5 + + lxvw4x vs7, o48, AO // load a6, a7 + + + addi AO, AO, 64 + + lxvw4x vs24, o0, BO // load b0, b1 + + + + xxspltw vs16, vs24, 0 + xxspltw vs17, vs24, 1 + xxspltw vs18, vs24, 2 + xxspltw vs19, vs24, 3 + + + addi BO, BO, 8 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs39, vs3, vs9 // a3_r*b0_i, 
a3_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x8_1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + lxvw4x vs5, o16, AO // load a2, a3 + + lxvw4x vs6, o32, AO // load a4, a5 + + lxvw4x vs7, o48, AO // load a6, a7 + + + addi AO, AO, 64 + + lxvw4x vs24, o0, BO // load b0, b1 + + + + xxspltw vs16, vs24, 0 + xxspltw vs17, vs24, 1 + xxspltw vs18, vs24, 2 + xxspltw vs19, vs24, 3 + + + addi BO, BO, 8 + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x8_2 + + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + lxvw4x vs2, o32, AO // load a4, a5 + + lxvw4x vs3, o48, AO // load a6, a7 + + + addi AO, AO, 64 + + lxvw4x vs24, o0, BO // load b0, b1 + + + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 8 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs36, vs6, vs16 // a6_r*b0_r, a6_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs37, vs6, vs17 // a6_r*b0_i, a6_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs38, vs7, vs16 // a7_r*b0_r, a7_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs39, vs7, vs17 // a7_r*b0_i, a7_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x8_E2 + + 
+ xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs36, vs6, vs16 // a6_r*b0_r, a6_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs37, vs6, vs17 // a6_r*b0_i, a6_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs38, vs7, vs16 // a7_r*b0_r, a7_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs39, vs7, vs17 // a7_r*b0_i, a7_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x8_SUBI1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + lxvw4x vs2, o32, AO // load a4, a5 + + lxvw4x vs3, o48, AO // load a6, a7 + + + addi AO, AO, 64 + + lxvw4x vs24, o0, BO // load b0, b1 + + + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 8 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x8_SUB1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + lxvw4x vs2, o32, AO // load a4, a5 + + lxvw4x vs3, o48, AO // load a6, a7 + + + addi AO, AO, 64 + + lxvw4x vs24, o0, BO // load b0, b1 + + + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 8 + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + 
xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro SAVE1x8 + + mr T1, CO + +// N=0 + + mr T2, T1 + +// N=0 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + stxvw4x vs32, o0, TBUFFER + + lxsspx vs8, o0, TBUFFER + lxsspx vs9, o4, TBUFFER + lxsspx vs10, o8, TBUFFER + lxsspx vs11, o12, TBUFFER + + stxvw4x vs33, o0, TBUFFER + + lxsspx vs12, o0, TBUFFER + lxsspx vs13, o4, TBUFFER + lxsspx vs14, o8, TBUFFER + lxsspx vs15, o12, TBUFFER + + XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r + xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i + xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i + xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + + xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r + xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i + xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i + xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + + xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + 
xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + stxsspx vs20, o0, TBUFFER // store r0_r + stxsspx vs21, o4, TBUFFER // store r0_i + stxsspx vs22, o8, TBUFFER // store r1_r + stxsspx vs23, o12, TBUFFER // store r1_i + lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=0 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + stxvw4x vs34, o0, TBUFFER + + lxsspx vs8, o0, TBUFFER + lxsspx vs9, o4, TBUFFER + lxsspx vs10, o8, TBUFFER + lxsspx vs11, o12, TBUFFER + + stxvw4x vs35, o0, TBUFFER + + lxsspx vs12, o0, TBUFFER + lxsspx vs13, o4, TBUFFER + lxsspx vs14, o8, TBUFFER + lxsspx vs15, o12, TBUFFER + + XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r + xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i + xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i + xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + + xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r + xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i + xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i + xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + + xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + stxsspx vs20, o0, TBUFFER // store r0_r + stxsspx vs21, o4, TBUFFER // store r0_i + stxsspx vs22, o8, TBUFFER // store r1_r + stxsspx vs23, o12, 
TBUFFER // store r1_i + lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=0 M=4 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + stxvw4x vs36, o0, TBUFFER + + lxsspx vs8, o0, TBUFFER + lxsspx vs9, o4, TBUFFER + lxsspx vs10, o8, TBUFFER + lxsspx vs11, o12, TBUFFER + + stxvw4x vs37, o0, TBUFFER + + lxsspx vs12, o0, TBUFFER + lxsspx vs13, o4, TBUFFER + lxsspx vs14, o8, TBUFFER + lxsspx vs15, o12, TBUFFER + + XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r + xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i + xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i + xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + + xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r + xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i + xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i + xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + + xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + stxsspx vs20, o0, TBUFFER // store r0_r + stxsspx vs21, o4, TBUFFER // store r0_i + stxsspx vs22, o8, TBUFFER // store r1_r + stxsspx vs23, o12, TBUFFER // store r1_i + lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=0 M=6 + + xxlxor vs4, vs4, vs4 + xxlxor 
vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + stxvw4x vs38, o0, TBUFFER + + lxsspx vs8, o0, TBUFFER + lxsspx vs9, o4, TBUFFER + lxsspx vs10, o8, TBUFFER + lxsspx vs11, o12, TBUFFER + + stxvw4x vs39, o0, TBUFFER + + lxsspx vs12, o0, TBUFFER + lxsspx vs13, o4, TBUFFER + lxsspx vs14, o8, TBUFFER + lxsspx vs15, o12, TBUFFER + + XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r + xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i + xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i + xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + + xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r + xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i + xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i + xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + + xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + stxsspx vs20, o0, TBUFFER // store r0_r + stxsspx vs21, o4, TBUFFER // store r0_i + stxsspx vs22, o8, TBUFFER // store r1_r + stxsspx vs23, o12, TBUFFER // store r1_i + lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + addi CO, CO, 64 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=4 
+**********************************************************************************************/ + +.macro LOAD1x4_1 + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + + addi AO, AO, 32 + + lxvw4x vs24, o0, BO // load b0, b1 + + + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 8 + +.endm + +.macro KERNEL1x4_I1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + lxvw4x vs5, o16, AO // load a2, a3 + + + addi AO, AO, 32 + + lxvw4x vs24, o0, BO // load b0, b1 + + + + xxspltw vs16, vs24, 0 + xxspltw vs17, vs24, 1 + xxspltw vs18, vs24, 2 + xxspltw vs19, vs24, 3 + + + addi BO, BO, 8 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x4_1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + lxvw4x vs5, o16, AO // load a2, a3 + + + addi AO, AO, 32 + + lxvw4x vs24, o0, BO // load b0, b1 + + + + xxspltw vs16, vs24, 0 + xxspltw vs17, vs24, 1 + xxspltw vs18, vs24, 2 + xxspltw vs19, vs24, 3 + + + addi BO, BO, 8 + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x4_2 + + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + + addi AO, AO, 32 + + lxvw4x vs24, o0, BO // load b0, b1 + + + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 8 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + 
xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x4_E2 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x4_SUBI1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + + addi AO, AO, 32 + + lxvw4x vs24, o0, BO // load b0, b1 + + + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 8 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x4_SUB1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + + addi AO, AO, 32 + + lxvw4x vs24, o0, BO // load b0, b1 + + + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 8 + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro SAVE1x4 + + mr T1, CO + +// N=0 + + mr T2, T1 + +// N=0 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + stxvw4x vs32, o0, TBUFFER + + lxsspx vs8, o0, TBUFFER + lxsspx vs9, 
o4, TBUFFER + lxsspx vs10, o8, TBUFFER + lxsspx vs11, o12, TBUFFER + + stxvw4x vs33, o0, TBUFFER + + lxsspx vs12, o0, TBUFFER + lxsspx vs13, o4, TBUFFER + lxsspx vs14, o8, TBUFFER + lxsspx vs15, o12, TBUFFER + + XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r + xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i + xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i + xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + + xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r + xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i + xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i + xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + + xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + stxsspx vs20, o0, TBUFFER // store r0_r + stxsspx vs21, o4, TBUFFER // store r0_i + stxsspx vs22, o8, TBUFFER // store r1_r + stxsspx vs23, o12, TBUFFER // store r1_i + lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=0 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + stxvw4x vs34, o0, TBUFFER + + lxsspx vs8, o0, TBUFFER + lxsspx vs9, o4, TBUFFER + lxsspx vs10, o8, TBUFFER + lxsspx vs11, o12, TBUFFER + + stxvw4x vs35, o0, TBUFFER + + lxsspx vs12, o0, TBUFFER + lxsspx vs13, o4, TBUFFER + lxsspx vs14, o8, TBUFFER + lxsspx vs15, o12, TBUFFER + 
+ XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r + xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i + xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i + xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + + xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r + xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i + xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i + xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + + xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + stxsspx vs20, o0, TBUFFER // store r0_r + stxsspx vs21, o4, TBUFFER // store r0_i + stxsspx vs22, o8, TBUFFER // store r1_r + stxsspx vs23, o12, TBUFFER // store r1_i + lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + addi CO, CO, 32 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=2 +**********************************************************************************************/ + +.macro LOAD1x2_1 + + lxvw4x vs0, o0, AO // load a0, a1 + + + addi AO, AO, 16 + + lxvw4x vs24, o0, BO // load b0, b1 + + + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 8 + +.endm + +.macro KERNEL1x2_I1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + + addi AO, AO, 16 + + lxvw4x vs24, o0, BO // load b0, b1 + + + + xxspltw vs16, vs24, 0 + xxspltw vs17, 
vs24, 1 + xxspltw vs18, vs24, 2 + xxspltw vs19, vs24, 3 + + + addi BO, BO, 8 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x2_1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + + addi AO, AO, 16 + + lxvw4x vs24, o0, BO // load b0, b1 + + + + xxspltw vs16, vs24, 0 + xxspltw vs17, vs24, 1 + xxspltw vs18, vs24, 2 + xxspltw vs19, vs24, 3 + + + addi BO, BO, 8 + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x2_2 + + + lxvw4x vs0, o0, AO // load a0, a1 + + + addi AO, AO, 16 + + lxvw4x vs24, o0, BO // load b0, b1 + + + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 8 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x2_E2 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x2_SUBI1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + + addi AO, AO, 16 + + lxvw4x vs24, o0, BO // load b0, b1 + + + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 8 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x2_SUB1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + + addi AO, AO, 16 + + lxvw4x vs24, o0, BO // load b0, b1 + + + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 8 + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // 
a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro SAVE1x2 + + mr T1, CO + +// N=0 + + mr T2, T1 + +// N=0 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + stxvw4x vs32, o0, TBUFFER + + lxsspx vs8, o0, TBUFFER + lxsspx vs9, o4, TBUFFER + lxsspx vs10, o8, TBUFFER + lxsspx vs11, o12, TBUFFER + + stxvw4x vs33, o0, TBUFFER + + lxsspx vs12, o0, TBUFFER + lxsspx vs13, o4, TBUFFER + lxsspx vs14, o8, TBUFFER + lxsspx vs15, o12, TBUFFER + + XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r + xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i + xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i + xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + + xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r + xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i + xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i + xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + + xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + stxsspx vs20, o0, TBUFFER // store r0_r + stxsspx vs21, o4, TBUFFER // store r0_i + stxsspx vs22, o8, TBUFFER // store r1_r + stxsspx vs23, o12, TBUFFER // store r1_i + lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + addi CO, CO, 16 + +.endm + + 
+/********************************************************************************************** +* Macros for N=1 and M=1 +**********************************************************************************************/ + +.macro LOAD1x1_1 + + lxsspx vs0, o0, AO // load a0_r + lxsspx vs1, o4, AO // load a0_i + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 // load b0_r + lxsspx vs9, o4, T1 // load b0_i + + addi BO, BO, 8 + +.endm + +.macro KERNEL1x1_I1 + + + lxsspx vs4, o0, AO // load a0_r + lxsspx vs5, o4, AO // load a0_i + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs16, o0, T1 // load b0_r + lxsspx vs17, o4, T1 // load b0_i + + addi BO, BO, 8 + + + xsmulsp vs32, vs0, vs8 // a0_r*b0_r + xsmulsp vs33, vs1, vs9 // a0_i*b0_i + xsmulsp vs34, vs0, vs9 // a0_r*b0_i + xsmulsp vs35, vs1, vs8 // a0_i*b0_r + + +.endm + +.macro KERNEL1x1_1 + + + lxsspx vs4, o0, AO // load a0_r + lxsspx vs5, o4, AO // load a0_i + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs16, o0, T1 // load b0_r + lxsspx vs17, o4, T1 // load b0_i + + addi BO, BO, 8 + + + xsmaddasp vs32, vs0, vs8 // a0_r*b0_r + xsmaddasp vs33, vs1, vs9 // a0_i*b0_i + xsmaddasp vs34, vs0, vs9 // a0_r*b0_i + xsmaddasp vs35, vs1, vs8 // a0_i*b0_r + + +.endm + +.macro KERNEL1x1_2 + + + lxsspx vs0, o0, AO // load a0_r + lxsspx vs1, o4, AO // load a0_i + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 // load b0_r + lxsspx vs9, o4, T1 // load b0_i + + addi BO, BO, 8 + + + xsmaddasp vs32, vs4, vs16 // a4_r*b0_r + xsmaddasp vs33, vs5, vs17 // a4_i*b0_i + xsmaddasp vs34, vs4, vs17 // a4_r*b0_i + xsmaddasp vs35, vs5, vs16 // a4_i*b0_r + + +.endm + +.macro KERNEL1x1_E2 + + + xsmaddasp vs32, vs4, vs16 // a4_r*b0_r + xsmaddasp vs33, vs5, vs17 // a4_i*b0_i + xsmaddasp vs34, vs4, vs17 // a4_r*b0_i + xsmaddasp vs35, vs5, vs16 // a4_i*b0_r + + +.endm + +.macro KERNEL1x1_SUBI1 + + + lxsspx vs0, o0, AO // load a0_r + lxsspx vs1, o4, AO // load a0_i + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 // load b0_r + lxsspx vs9, 
o4, T1 // load b0_i + + addi BO, BO, 8 + + + xsmulsp vs32, vs0, vs8 // a0_r*b0_r + xsmulsp vs33, vs1, vs9 // a0_i*b0_i + xsmulsp vs34, vs0, vs9 // a0_r*b0_i + xsmulsp vs35, vs1, vs8 // a0_i*b0_r + + +.endm + +.macro KERNEL1x1_SUB1 + + + lxsspx vs0, o0, AO // load a0_r + lxsspx vs1, o4, AO // load a0_i + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 // load b0_r + lxsspx vs9, o4, T1 // load b0_i + + addi BO, BO, 8 + + + xsmaddasp vs32, vs0, vs8 // a0_r*b0_r + xsmaddasp vs33, vs1, vs9 // a0_i*b0_i + xsmaddasp vs34, vs0, vs9 // a0_r*b0_i + xsmaddasp vs35, vs1, vs8 // a0_i*b0_r + + +.endm + +.macro SAVE1x1 + + mr T1, CO + +// N=0 + + mr T2, T1 + +// N=0 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + +#ifndef TRMMKERNEL + lxsspx vs0, o0, T2 // load c0_r + lxsspx vs1, o4, T2 // load c0_i +#else + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 +#endif + + XSFADD_R1 vs4, vs4, vs32 // add a0_r * b0_r + XSFADD_I1 vs5, vs5, vs35 // add a0_r * b0_i + + XSFADD_R2 vs4, vs4, vs33 // add a0_i * b0_i + XSFADD_I2 vs5, vs5, vs34 // add a0_i * b0_r + + xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r + xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i + xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i + xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + + xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsaddsp vs0, vs0, vs20 + xsaddsp vs1, vs1, vs21 + + + stxsspx vs0, o0, T2 // store c0_r + stxsspx vs1, o4, T2 // store c0_i + + addi T2, T2, 8 + add T1, T1, LDC + + addi CO, CO, 8 + +.endm + diff --git a/kernel/power/ctrmm_kernel_8x4_power8.S b/kernel/power/ctrmm_kernel_8x4_power8.S new file mode 100644 index 000000000..b15485751 --- /dev/null +++ b/kernel/power/ctrmm_kernel_8x4_power8.S @@ -0,0 +1,385 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/03/18 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. 
*/ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" +#include "def_vsx.h" + +#ifndef __64BIT__ +#define LOAD lwz +#else +#define LOAD ld +#endif + +#ifdef __64BIT__ +#define STACKSIZE 400 +#define ALPHA_R_SP 304(SP) +#define ALPHA_I_SP 312(SP) +#else +#define STACKSIZE 256 +#define ALPHA_R_SP 224(SP) +#define ALPHA_I_SP 232(SP) +#define FZERO 240(SP) +#endif + +#define M r3 +#define N r4 +#define K r5 + +#ifdef linux +#ifndef __64BIT__ +#define A r6 +#define B r7 +#define C r8 +#define LDC r9 +#define OFFSET r10 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#if !defined(__64BIT__) && defined(DOUBLE) +#define A r10 +#define B r6 +#define C r7 +#define LDC r8 +#define OFFSET r9 +#else +#define A r8 +#define B r9 +#define C r10 +#define LDC r6 +#define OFFSET r7 +#endif +#endif + +#define o0 0 +#define alpha_r vs30 +#define alpha_i vs31 +#define alpha_vr vs28 +#define alpha_vi vs29 + + +#define o12 r12 +#define KKK r13 +#define K1 r14 +#define L r15 +#define o16 r16 +#define TBUFFER r17 +#define T2 r19 +#define KK r20 +#define o8 r21 +#define I r22 +#define J r23 +#define AO r24 +#define BO r25 +#define CO r26 +#define o4 r27 +#define o32 r28 +#define o48 r29 + +#define PRE r30 +#define T1 r31 + +#ifndef NEEDPARAM + + PROLOGUE + PROFCODE + + addi SP, SP, -STACKSIZE + li r0, 0 + + stfd f14, 0(SP) + stfd f15, 8(SP) + stfd f16, 16(SP) + stfd f17, 24(SP) + + stfd f18, 32(SP) + stfd f19, 40(SP) + stfd f20, 48(SP) + stfd f21, 56(SP) + + stfd f22, 64(SP) + stfd f23, 72(SP) + stfd f24, 80(SP) + stfd f25, 88(SP) + + stfd f26, 96(SP) + stfd f27, 104(SP) + stfd f28, 112(SP) + stfd f29, 120(SP) + + stfd f30, 128(SP) + stfd f31, 136(SP) + +#ifdef __64BIT__ + std r31, 144(SP) + std r30, 152(SP) + std r29, 160(SP) + std r28, 168(SP) + std r27, 176(SP) + std r26, 184(SP) + std r25, 192(SP) + std r24, 200(SP) + std r23, 
208(SP) + std r22, 216(SP) + std r21, 224(SP) + std r20, 232(SP) + std r19, 240(SP) + std r18, 248(SP) + std r17, 256(SP) + std r16, 264(SP) + std r15, 272(SP) + std r14, 280(SP) + std r13, 288(SP) + std r12, 296(SP) +#else + stw r31, 144(SP) + stw r30, 148(SP) + stw r29, 152(SP) + stw r28, 156(SP) + stw r27, 160(SP) + stw r26, 164(SP) + stw r25, 168(SP) + stw r24, 172(SP) + stw r23, 176(SP) + stw r22, 180(SP) + stw r21, 184(SP) + stw r20, 188(SP) + stw r19, 192(SP) + stw r18, 196(SP) + stw r17, 200(SP) + stw r16, 204(SP) + stw r15, 208(SP) + stw r14, 212(SP) + stw r13, 216(SP) +#endif + + stfs f1, ALPHA_R_SP + stfs f2, ALPHA_I_SP + // stw r0, FZERO + +#ifdef linux +#ifdef __64BIT__ + ld LDC, FRAMESLOT(0) + STACKSIZE(SP) +#endif +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld LDC, FRAMESLOT(0) + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz B, FRAMESLOT(0) + STACKSIZE(SP) + lwz C, FRAMESLOT(1) + STACKSIZE(SP) + lwz LDC, FRAMESLOT(2) + STACKSIZE(SP) +#else + lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) +#endif +#endif +#endif + +#ifdef TRMMKERNEL +#if defined(linux) && defined(__64BIT__) + ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) +#endif + +#if defined(_AIX) || defined(__APPLE__) +#ifdef __64BIT__ + ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) +#else +#ifdef DOUBLE + lwz OFFSET, FRAMESLOT(3) + STACKSIZE(SP) +#else + lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) +#endif +#endif +#endif +#if defined(TRMMKERNEL) && !defined(LEFT) + neg KK, OFFSET +#endif +#endif + +#include "cgemm_macros_8x4_power8.S" + + cmpwi cr0, M, 0 + ble .L999_H1 + cmpwi cr0, N, 0 + ble .L999_H1 + cmpwi cr0, K, 0 + ble .L999_H1 + + slwi LDC, LDC, ZBASE_SHIFT + li PRE, 256 + li o4 , 4 + li o8 , 8 + li o12 , 12 + li o16 , 16 + li o32 , 32 + li o48 , 48 + addi TBUFFER, SP, 360 + + +#ifdef __64BIT__ + addi T1, SP, 304 +#else + addi T1, SP, 224 +#endif + + lxsspx alpha_r, 0, T1 + lxsspx alpha_i, o8, T1 + + .align 5 + +#include "ctrmm_logic_8x4_power8.S" + +.L999: + addi r3, 0, 0 + + lfd f14, 0(SP) 
+ lfd f15, 8(SP) + lfd f16, 16(SP) + lfd f17, 24(SP) + + lfd f18, 32(SP) + lfd f19, 40(SP) + lfd f20, 48(SP) + lfd f21, 56(SP) + + lfd f22, 64(SP) + lfd f23, 72(SP) + lfd f24, 80(SP) + lfd f25, 88(SP) + + lfd f26, 96(SP) + lfd f27, 104(SP) + lfd f28, 112(SP) + lfd f29, 120(SP) + + lfd f30, 128(SP) + lfd f31, 136(SP) + +#ifdef __64BIT__ + ld r31, 144(SP) + ld r30, 152(SP) + ld r29, 160(SP) + ld r28, 168(SP) + ld r27, 176(SP) + ld r26, 184(SP) + ld r25, 192(SP) + ld r24, 200(SP) + ld r23, 208(SP) + ld r22, 216(SP) + ld r21, 224(SP) + ld r20, 232(SP) + ld r19, 240(SP) + ld r18, 248(SP) + ld r17, 256(SP) + ld r16, 264(SP) + ld r15, 272(SP) + ld r14, 280(SP) + ld r13, 288(SP) + ld r12, 296(SP) +#else + lwz r31, 144(SP) + lwz r30, 148(SP) + lwz r29, 152(SP) + lwz r28, 156(SP) + lwz r27, 160(SP) + lwz r26, 164(SP) + lwz r25, 168(SP) + lwz r24, 172(SP) + lwz r23, 176(SP) + lwz r22, 180(SP) + lwz r21, 184(SP) + lwz r20, 188(SP) + lwz r19, 192(SP) + lwz r18, 196(SP) + lwz r17, 200(SP) + lwz r16, 204(SP) + lwz r15, 208(SP) + lwz r14, 212(SP) + lwz r13, 216(SP) +#endif + + addi SP, SP, STACKSIZE + + blr + + EPILOGUE +#endif diff --git a/kernel/power/ctrmm_logic_8x4_power8.S b/kernel/power/ctrmm_logic_8x4_power8.S new file mode 100644 index 000000000..f9656e90b --- /dev/null +++ b/kernel/power/ctrmm_logic_8x4_power8.S @@ -0,0 +1,1756 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/03/18 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + + + srawi. J, N, 2 + ble .LCTRMM_L4_END + +.LCTRMM_L4_BEGIN: + + mr CO, C + mr AO, A + slwi T1, LDC , 2 + add C, C, T1 + +#if defined(LEFT) + mr KK, OFFSET // OFFSET -> KK +#endif + + srawi. 
I, M, 3 + ble .LCTRMM_L4x8_END + +.LCTRMM_L4x8_BEGIN: + + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 5 // Number of values in B shifted + slwi T2, KK, 6 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 8 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 4 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble .LCTRMM_L4x8_SUB0 + cmpwi cr0, L, 1 + ble .LCTRMM_L4x8_SUB4 + +.LCTRMM_L4x8_LOOP_START: + + LOAD4x8_1 + KERNEL4x8_I1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_2 + + KERNEL4x8_1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_2 + + addic. L, L, -2 + ble .LCTRMM_L4x8_LOOP_END + + .align 5 + +.LCTRMM_L4x8_LOOP: + + KERNEL4x8_1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_2 + + KERNEL4x8_1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_2 + + addic. L, L, -1 + bgt .LCTRMM_L4x8_LOOP + +.LCTRMM_L4x8_LOOP_END: + + KERNEL4x8_1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_2 + + KERNEL4x8_1 + KERNEL4x8_2 + KERNEL4x8_1 + KERNEL4x8_E2 + + b .LCTRMM_L4x8_SUB1 + +.LCTRMM_L4x8_SUB4: + + KERNEL4x8_SUBI1 + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + KERNEL4x8_SUB1 + + b .LCTRMM_L4x8_SUB1 + +.LCTRMM_L4x8_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL4x8_SUBI1 + + addic. L, L, -1 + ble .LCTRMM_L4x8_SAVE + b .LCTRMM_L4x8_SUB2 + +.LCTRMM_L4x8_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble .LCTRMM_L4x8_SAVE + +.LCTRMM_L4x8_SUB2: + + KERNEL4x8_SUB1 + + addic. 
L, L, -1 + bgt .LCTRMM_L4x8_SUB2 + +.LCTRMM_L4x8_SAVE: + + SAVE4x8 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 6 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 8 // KK += Number of values in A +#endif + + + addic. I, I, -1 + bgt .LCTRMM_L4x8_BEGIN + +.LCTRMM_L4x8_END: + +.LCTRMM_L4x4_BEGIN: + andi. T2, M, 7 + ble .LCTRMM_L4x1_END + + andi. T1, M, 4 + ble .LCTRMM_L4x4_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 5 // Number of values in B shifted + slwi T2, KK, 5 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 4 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 4 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble .LCTRMM_L4x4_SUB0 + cmpwi cr0, L, 1 + ble .LCTRMM_L4x4_SUB4 + +.LCTRMM_L4x4_LOOP_START: + + LOAD4x4_1 + KERNEL4x4_I1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + addic. L, L, -2 + ble .LCTRMM_L4x4_LOOP_END + + .align 5 + +.LCTRMM_L4x4_LOOP: + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + addic. 
L, L, -1 + bgt .LCTRMM_L4x4_LOOP + +.LCTRMM_L4x4_LOOP_END: + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_2 + + KERNEL4x4_1 + KERNEL4x4_2 + KERNEL4x4_1 + KERNEL4x4_E2 + + b .LCTRMM_L4x4_SUB1 + +.LCTRMM_L4x4_SUB4: + + KERNEL4x4_SUBI1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + KERNEL4x4_SUB1 + + b .LCTRMM_L4x4_SUB1 + +.LCTRMM_L4x4_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL4x4_SUBI1 + + addic. L, L, -1 + ble .LCTRMM_L4x4_SAVE + b .LCTRMM_L4x4_SUB2 + +.LCTRMM_L4x4_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble .LCTRMM_L4x4_SAVE + +.LCTRMM_L4x4_SUB2: + + KERNEL4x4_SUB1 + + addic. L, L, -1 + bgt .LCTRMM_L4x4_SUB2 + +.LCTRMM_L4x4_SAVE: + + SAVE4x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 5 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 4 // KK += Number of values in A +#endif + + +.LCTRMM_L4x4_END: + +.LCTRMM_L4x2_BEGIN: + + andi. T1, M, 2 + ble .LCTRMM_L4x2_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 5 // Number of values in B shifted + slwi T2, KK, 4 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 2 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 4 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. 
L, K1, 3 // KTEMP / 8 -> L + ble .LCTRMM_L4x2_SUB0 + cmpwi cr0, L, 1 + ble .LCTRMM_L4x2_SUB4 + +.LCTRMM_L4x2_LOOP_START: + + LOAD4x2_1 + KERNEL4x2_I1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + addic. L, L, -2 + ble .LCTRMM_L4x2_LOOP_END + + .align 5 + +.LCTRMM_L4x2_LOOP: + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + addic. L, L, -1 + bgt .LCTRMM_L4x2_LOOP + +.LCTRMM_L4x2_LOOP_END: + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_2 + + KERNEL4x2_1 + KERNEL4x2_2 + KERNEL4x2_1 + KERNEL4x2_E2 + + b .LCTRMM_L4x2_SUB1 + +.LCTRMM_L4x2_SUB4: + + KERNEL4x2_SUBI1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + KERNEL4x2_SUB1 + + b .LCTRMM_L4x2_SUB1 + +.LCTRMM_L4x2_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL4x2_SUBI1 + + addic. L, L, -1 + ble .LCTRMM_L4x2_SAVE + b .LCTRMM_L4x2_SUB2 + +.LCTRMM_L4x2_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble .LCTRMM_L4x2_SAVE + +.LCTRMM_L4x2_SUB2: + + KERNEL4x2_SUB1 + + addic. L, L, -1 + bgt .LCTRMM_L4x2_SUB2 + +.LCTRMM_L4x2_SAVE: + + SAVE4x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 4 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 2 // KK += Number of values in A +#endif + + +.LCTRMM_L4x2_END: + +.LCTRMM_L4x1_BEGIN: + + andi. 
T1, M, 1 + ble .LCTRMM_L4x1_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 5 // Number of values in B shifted + slwi T2, KK, 3 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 1 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 4 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble .LCTRMM_L4x1_SUB0 + cmpwi cr0, L, 1 + ble .LCTRMM_L4x1_SUB4 + +.LCTRMM_L4x1_LOOP_START: + + LOAD4x1_1 + KERNEL4x1_I1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + addic. L, L, -2 + ble .LCTRMM_L4x1_LOOP_END + + .align 5 + +.LCTRMM_L4x1_LOOP: + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + addic. L, L, -1 + bgt .LCTRMM_L4x1_LOOP + +.LCTRMM_L4x1_LOOP_END: + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_2 + + KERNEL4x1_1 + KERNEL4x1_2 + KERNEL4x1_1 + KERNEL4x1_E2 + + b .LCTRMM_L4x1_SUB1 + +.LCTRMM_L4x1_SUB4: + + KERNEL4x1_SUBI1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + KERNEL4x1_SUB1 + + b .LCTRMM_L4x1_SUB1 + +.LCTRMM_L4x1_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL4x1_SUBI1 + + addic. L, L, -1 + ble .LCTRMM_L4x1_SAVE + b .LCTRMM_L4x1_SUB2 + +.LCTRMM_L4x1_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble .LCTRMM_L4x1_SAVE + +.LCTRMM_L4x1_SUB2: + + KERNEL4x1_SUB1 + + addic. 
L, L, -1 + bgt .LCTRMM_L4x1_SUB2 + +.LCTRMM_L4x1_SAVE: + + SAVE4x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 5 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 3 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 1 // KK += Number of values in A +#endif + + +.LCTRMM_L4x1_END: + + slwi T1, K, 5 + add B, B, T1 + +#if !defined(LEFT) + addi KK, KK, 4 // KK += Number of values in B +#endif + + + addic. J, J, -1 + bgt .LCTRMM_L4_BEGIN + + andi. T2, N, 3 + ble .L999_H2 + +.LCTRMM_L4_END: + + b .LCTRMM_L2_BEGIN + +.L999_H1: + + b .L999_H2 + +.LCTRMM_L2_BEGIN: + + andi. T1, N, 2 + ble .LCTRMM_L2_END + mr CO, C + mr AO, A + slwi T1, LDC , 1 + add C, C, T1 + +#if defined(LEFT) + mr KK, OFFSET // OFFSET -> KK +#endif + + srawi. I, M, 3 + ble .LCTRMM_L2x8_END + +.LCTRMM_L2x8_BEGIN: + + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 4 // Number of values in B shifted + slwi T2, KK, 6 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 8 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble .LCTRMM_L2x8_SUB0 + cmpwi cr0, L, 1 + ble .LCTRMM_L2x8_SUB4 + +.LCTRMM_L2x8_LOOP_START: + + LOAD2x8_1 + KERNEL2x8_I1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + addic. 
L, L, -2 + ble .LCTRMM_L2x8_LOOP_END + + .align 5 + +.LCTRMM_L2x8_LOOP: + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + addic. L, L, -1 + bgt .LCTRMM_L2x8_LOOP + +.LCTRMM_L2x8_LOOP_END: + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_2 + + KERNEL2x8_1 + KERNEL2x8_2 + KERNEL2x8_1 + KERNEL2x8_E2 + + b .LCTRMM_L2x8_SUB1 + +.LCTRMM_L2x8_SUB4: + + KERNEL2x8_SUBI1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + KERNEL2x8_SUB1 + + b .LCTRMM_L2x8_SUB1 + +.LCTRMM_L2x8_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL2x8_SUBI1 + + addic. L, L, -1 + ble .LCTRMM_L2x8_SAVE + b .LCTRMM_L2x8_SUB2 + +.LCTRMM_L2x8_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble .LCTRMM_L2x8_SAVE + +.LCTRMM_L2x8_SUB2: + + KERNEL2x8_SUB1 + + addic. L, L, -1 + bgt .LCTRMM_L2x8_SUB2 + +.LCTRMM_L2x8_SAVE: + + SAVE2x8 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 6 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 8 // KK += Number of values in A +#endif + + + addic. I, I, -1 + bgt .LCTRMM_L2x8_BEGIN + +.LCTRMM_L2x8_END: + +.LCTRMM_L2x4_BEGIN: + andi. T2, M, 7 + ble .LCTRMM_L2x1_END + + andi. 
T1, M, 4 + ble .LCTRMM_L2x4_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 4 // Number of values in B shifted + slwi T2, KK, 5 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 4 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble .LCTRMM_L2x4_SUB0 + cmpwi cr0, L, 1 + ble .LCTRMM_L2x4_SUB4 + +.LCTRMM_L2x4_LOOP_START: + + LOAD2x4_1 + KERNEL2x4_I1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + addic. L, L, -2 + ble .LCTRMM_L2x4_LOOP_END + + .align 5 + +.LCTRMM_L2x4_LOOP: + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + addic. L, L, -1 + bgt .LCTRMM_L2x4_LOOP + +.LCTRMM_L2x4_LOOP_END: + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_2 + + KERNEL2x4_1 + KERNEL2x4_2 + KERNEL2x4_1 + KERNEL2x4_E2 + + b .LCTRMM_L2x4_SUB1 + +.LCTRMM_L2x4_SUB4: + + KERNEL2x4_SUBI1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + KERNEL2x4_SUB1 + + b .LCTRMM_L2x4_SUB1 + +.LCTRMM_L2x4_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL2x4_SUBI1 + + addic. L, L, -1 + ble .LCTRMM_L2x4_SAVE + b .LCTRMM_L2x4_SUB2 + +.LCTRMM_L2x4_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble .LCTRMM_L2x4_SAVE + +.LCTRMM_L2x4_SUB2: + + KERNEL2x4_SUB1 + + addic. 
L, L, -1 + bgt .LCTRMM_L2x4_SUB2 + +.LCTRMM_L2x4_SAVE: + + SAVE2x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 5 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 4 // KK += Number of values in A +#endif + + +.LCTRMM_L2x4_END: + +.LCTRMM_L2x2_BEGIN: + + andi. T1, M, 2 + ble .LCTRMM_L2x2_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 4 // Number of values in B shifted + slwi T2, KK, 4 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 2 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble .LCTRMM_L2x2_SUB0 + cmpwi cr0, L, 1 + ble .LCTRMM_L2x2_SUB4 + +.LCTRMM_L2x2_LOOP_START: + + LOAD2x2_1 + KERNEL2x2_I1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + addic. L, L, -2 + ble .LCTRMM_L2x2_LOOP_END + + .align 5 + +.LCTRMM_L2x2_LOOP: + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + addic. 
L, L, -1 + bgt .LCTRMM_L2x2_LOOP + +.LCTRMM_L2x2_LOOP_END: + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_2 + + KERNEL2x2_1 + KERNEL2x2_2 + KERNEL2x2_1 + KERNEL2x2_E2 + + b .LCTRMM_L2x2_SUB1 + +.LCTRMM_L2x2_SUB4: + + KERNEL2x2_SUBI1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + KERNEL2x2_SUB1 + + b .LCTRMM_L2x2_SUB1 + +.LCTRMM_L2x2_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL2x2_SUBI1 + + addic. L, L, -1 + ble .LCTRMM_L2x2_SAVE + b .LCTRMM_L2x2_SUB2 + +.LCTRMM_L2x2_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble .LCTRMM_L2x2_SAVE + +.LCTRMM_L2x2_SUB2: + + KERNEL2x2_SUB1 + + addic. L, L, -1 + bgt .LCTRMM_L2x2_SUB2 + +.LCTRMM_L2x2_SAVE: + + SAVE2x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 4 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 2 // KK += Number of values in A +#endif + + +.LCTRMM_L2x2_END: + +.LCTRMM_L2x1_BEGIN: + + andi. T1, M, 1 + ble .LCTRMM_L2x1_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 4 // Number of values in B shifted + slwi T2, KK, 3 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 1 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 2 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. 
L, K1, 3 // KTEMP / 8 -> L + ble .LCTRMM_L2x1_SUB0 + cmpwi cr0, L, 1 + ble .LCTRMM_L2x1_SUB4 + +.LCTRMM_L2x1_LOOP_START: + + LOAD2x1_1 + KERNEL2x1_I1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + addic. L, L, -2 + ble .LCTRMM_L2x1_LOOP_END + + .align 5 + +.LCTRMM_L2x1_LOOP: + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + addic. L, L, -1 + bgt .LCTRMM_L2x1_LOOP + +.LCTRMM_L2x1_LOOP_END: + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_2 + + KERNEL2x1_1 + KERNEL2x1_2 + KERNEL2x1_1 + KERNEL2x1_E2 + + b .LCTRMM_L2x1_SUB1 + +.LCTRMM_L2x1_SUB4: + + KERNEL2x1_SUBI1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + KERNEL2x1_SUB1 + + b .LCTRMM_L2x1_SUB1 + +.LCTRMM_L2x1_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL2x1_SUBI1 + + addic. L, L, -1 + ble .LCTRMM_L2x1_SAVE + b .LCTRMM_L2x1_SUB2 + +.LCTRMM_L2x1_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble .LCTRMM_L2x1_SAVE + +.LCTRMM_L2x1_SUB2: + + KERNEL2x1_SUB1 + + addic. L, L, -1 + bgt .LCTRMM_L2x1_SUB2 + +.LCTRMM_L2x1_SAVE: + + SAVE2x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 4 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 3 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 1 // KK += Number of values in A +#endif + + +.LCTRMM_L2x1_END: + + slwi T1, K, 4 + add B, B, T1 + +#if !defined(LEFT) + addi KK, KK, 2 // KK += Number of values in B +#endif + + +.LCTRMM_L2_END: + + b .LCTRMM_L1_BEGIN + +.L999_H2: + + b .L999 + +.LCTRMM_L1_BEGIN: + + andi. 
T1, N, 1 + ble .LCTRMM_L1_END + mr CO, C + mr AO, A + +#if defined(LEFT) + mr KK, OFFSET // OFFSET -> KK +#endif + + srawi. I, M, 3 + ble .LCTRMM_L1x8_END + +.LCTRMM_L1x8_BEGIN: + + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 3 // Number of values in B shifted + slwi T2, KK, 6 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 8 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble .LCTRMM_L1x8_SUB0 + cmpwi cr0, L, 1 + ble .LCTRMM_L1x8_SUB4 + +.LCTRMM_L1x8_LOOP_START: + + LOAD1x8_1 + KERNEL1x8_I1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_2 + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_2 + + addic. L, L, -2 + ble .LCTRMM_L1x8_LOOP_END + + .align 5 + +.LCTRMM_L1x8_LOOP: + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_2 + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_2 + + addic. L, L, -1 + bgt .LCTRMM_L1x8_LOOP + +.LCTRMM_L1x8_LOOP_END: + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_2 + + KERNEL1x8_1 + KERNEL1x8_2 + KERNEL1x8_1 + KERNEL1x8_E2 + + b .LCTRMM_L1x8_SUB1 + +.LCTRMM_L1x8_SUB4: + + KERNEL1x8_SUBI1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + KERNEL1x8_SUB1 + + b .LCTRMM_L1x8_SUB1 + +.LCTRMM_L1x8_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL1x8_SUBI1 + + addic. L, L, -1 + ble .LCTRMM_L1x8_SAVE + b .LCTRMM_L1x8_SUB2 + +.LCTRMM_L1x8_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble .LCTRMM_L1x8_SAVE + +.LCTRMM_L1x8_SUB2: + + KERNEL1x8_SUB1 + + addic. 
L, L, -1 + bgt .LCTRMM_L1x8_SUB2 + +.LCTRMM_L1x8_SAVE: + + SAVE1x8 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 3 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 6 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 8 // KK += Number of values in A +#endif + + + addic. I, I, -1 + bgt .LCTRMM_L1x8_BEGIN + +.LCTRMM_L1x8_END: + +.LCTRMM_L1x4_BEGIN: + andi. T2, M, 7 + ble .LCTRMM_L1x1_END + + andi. T1, M, 4 + ble .LCTRMM_L1x4_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 3 // Number of values in B shifted + slwi T2, KK, 5 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 4 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble .LCTRMM_L1x4_SUB0 + cmpwi cr0, L, 1 + ble .LCTRMM_L1x4_SUB4 + +.LCTRMM_L1x4_LOOP_START: + + LOAD1x4_1 + KERNEL1x4_I1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + addic. L, L, -2 + ble .LCTRMM_L1x4_LOOP_END + + .align 5 + +.LCTRMM_L1x4_LOOP: + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + addic. 
L, L, -1 + bgt .LCTRMM_L1x4_LOOP + +.LCTRMM_L1x4_LOOP_END: + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_2 + + KERNEL1x4_1 + KERNEL1x4_2 + KERNEL1x4_1 + KERNEL1x4_E2 + + b .LCTRMM_L1x4_SUB1 + +.LCTRMM_L1x4_SUB4: + + KERNEL1x4_SUBI1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + KERNEL1x4_SUB1 + + b .LCTRMM_L1x4_SUB1 + +.LCTRMM_L1x4_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL1x4_SUBI1 + + addic. L, L, -1 + ble .LCTRMM_L1x4_SAVE + b .LCTRMM_L1x4_SUB2 + +.LCTRMM_L1x4_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble .LCTRMM_L1x4_SAVE + +.LCTRMM_L1x4_SUB2: + + KERNEL1x4_SUB1 + + addic. L, L, -1 + bgt .LCTRMM_L1x4_SUB2 + +.LCTRMM_L1x4_SAVE: + + SAVE1x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 3 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 5 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 4 // KK += Number of values in A +#endif + + +.LCTRMM_L1x4_END: + +.LCTRMM_L1x2_BEGIN: + + andi. T1, M, 2 + ble .LCTRMM_L1x2_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 3 // Number of values in B shifted + slwi T2, KK, 4 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 2 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. 
L, K1, 3 // KTEMP / 8 -> L + ble .LCTRMM_L1x2_SUB0 + cmpwi cr0, L, 1 + ble .LCTRMM_L1x2_SUB4 + +.LCTRMM_L1x2_LOOP_START: + + LOAD1x2_1 + KERNEL1x2_I1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + addic. L, L, -2 + ble .LCTRMM_L1x2_LOOP_END + + .align 5 + +.LCTRMM_L1x2_LOOP: + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + addic. L, L, -1 + bgt .LCTRMM_L1x2_LOOP + +.LCTRMM_L1x2_LOOP_END: + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_2 + + KERNEL1x2_1 + KERNEL1x2_2 + KERNEL1x2_1 + KERNEL1x2_E2 + + b .LCTRMM_L1x2_SUB1 + +.LCTRMM_L1x2_SUB4: + + KERNEL1x2_SUBI1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + KERNEL1x2_SUB1 + + b .LCTRMM_L1x2_SUB1 + +.LCTRMM_L1x2_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL1x2_SUBI1 + + addic. L, L, -1 + ble .LCTRMM_L1x2_SAVE + b .LCTRMM_L1x2_SUB2 + +.LCTRMM_L1x2_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble .LCTRMM_L1x2_SAVE + +.LCTRMM_L1x2_SUB2: + + KERNEL1x2_SUB1 + + addic. L, L, -1 + bgt .LCTRMM_L1x2_SUB2 + +.LCTRMM_L1x2_SAVE: + + SAVE1x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 3 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 4 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 2 // KK += Number of values in A +#endif + + +.LCTRMM_L1x2_END: + +.LCTRMM_L1x1_BEGIN: + + andi. 
T1, M, 1 + ble .LCTRMM_L1x1_END + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mr BO, B // B -> BO +#else + mr BO, B // B -> BO + slwi T1, KK, 3 // Number of values in B shifted + slwi T2, KK, 3 // Number of values in A shifted + add BO, BO, T1 // Add values to BO + add AO, AO, T2 // Add values to AO +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub T1, K, KK // K - KK -> TEMP1 +#else + mr T1, KK // KK -> KTEMP +#ifdef LEFT + addi T1, T1, 1 // KTEMP + Number of values in A -> KTEMP +#else + addi T1, T1, 1 // KTEMP + Number of values in B -> KTEMP +#endif +#endif + + mr KKK, T1 + mr K1, T1 + srawi. L, K1, 3 // KTEMP / 8 -> L + ble .LCTRMM_L1x1_SUB0 + cmpwi cr0, L, 1 + ble .LCTRMM_L1x1_SUB4 + +.LCTRMM_L1x1_LOOP_START: + + LOAD1x1_1 + KERNEL1x1_I1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + addic. L, L, -2 + ble .LCTRMM_L1x1_LOOP_END + + .align 5 + +.LCTRMM_L1x1_LOOP: + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + addic. L, L, -1 + bgt .LCTRMM_L1x1_LOOP + +.LCTRMM_L1x1_LOOP_END: + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_2 + + KERNEL1x1_1 + KERNEL1x1_2 + KERNEL1x1_1 + KERNEL1x1_E2 + + b .LCTRMM_L1x1_SUB1 + +.LCTRMM_L1x1_SUB4: + + KERNEL1x1_SUBI1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + KERNEL1x1_SUB1 + + b .LCTRMM_L1x1_SUB1 + +.LCTRMM_L1x1_SUB0: + + andi. L, K1, 7 // K1 & 7 -> L + + KERNEL1x1_SUBI1 + + addic. L, L, -1 + ble .LCTRMM_L1x1_SAVE + b .LCTRMM_L1x1_SUB2 + +.LCTRMM_L1x1_SUB1: + + andi. L, K1, 7 // K1 & 7 -> L + ble .LCTRMM_L1x1_SAVE + +.LCTRMM_L1x1_SUB2: + + KERNEL1x1_SUB1 + + addic. 
L, L, -1 + bgt .LCTRMM_L1x1_SUB2 + +.LCTRMM_L1x1_SAVE: + + SAVE1x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub T1, K, KKK // K - KKK -> TEMP1 + slwi T2, T1, 3 // TEMP1 * Number of values in B shifted -> TEMP2 + slwi T1, T1, 3 // TEMP1 * Number of values in A shifted -> TEMP1 + add BO, BO, T2 // BO += TEMP2 * number of values in B shifted + add AO, AO, T1 // AO += TEMP1 * number of values in A shifted +#endif + +#if defined(LEFT) + addi KK, KK, 1 // KK += Number of values in A +#endif + + +.LCTRMM_L1x1_END: + +#if !defined(LEFT) + addi KK, KK, 1 // KK += Number of values in B +#endif + + +.LCTRMM_L1_END: diff --git a/param.h b/param.h index f5d1ab2ea..980650e09 100644 --- a/param.h +++ b/param.h @@ -1972,23 +1972,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SGEMM_DEFAULT_UNROLL_N 8 #define DGEMM_DEFAULT_UNROLL_M 16 #define DGEMM_DEFAULT_UNROLL_N 4 -#define CGEMM_DEFAULT_UNROLL_M 2 -#define CGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_M 8 +#define CGEMM_DEFAULT_UNROLL_N 4 #define ZGEMM_DEFAULT_UNROLL_M 8 #define ZGEMM_DEFAULT_UNROLL_N 2 #define SGEMM_DEFAULT_P 960 #define DGEMM_DEFAULT_P 480 -#define CGEMM_DEFAULT_P 488 +#define CGEMM_DEFAULT_P 480 #define ZGEMM_DEFAULT_P 240 #define SGEMM_DEFAULT_Q 720 #define DGEMM_DEFAULT_Q 720 -#define CGEMM_DEFAULT_Q 400 +#define CGEMM_DEFAULT_Q 720 #define ZGEMM_DEFAULT_Q 360 #define SGEMM_DEFAULT_R 28800 #define DGEMM_DEFAULT_R 14400 +#define CGEMM_DEFAULT_R 14400 #define ZGEMM_DEFAULT_R 7200 #define SYMV_P 8 From e1df5a6e23c2ab73385984289f24472cb2f0cb66 Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Fri, 18 Mar 2016 12:12:03 +0100 Subject: [PATCH 05/48] fixed sgemm- and strmm-kernel --- kernel/power/sgemm_kernel_16x8_power8.S | 22 +- kernel/power/sgemm_logic_16x8_power8.S | 4 +- kernel/power/sgemm_macros_16x8_power8.S | 2978 +++++++++++++++++++---- kernel/power/strmm_kernel_16x8_power8.S | 21 +- 
kernel/power/strmm_logic_16x8_power8.S | 4 +- param.h | 4 +- 6 files changed, 2597 insertions(+), 436 deletions(-) diff --git a/kernel/power/sgemm_kernel_16x8_power8.S b/kernel/power/sgemm_kernel_16x8_power8.S index 9f221301a..031f342ad 100644 --- a/kernel/power/sgemm_kernel_16x8_power8.S +++ b/kernel/power/sgemm_kernel_16x8_power8.S @@ -26,10 +26,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** -* 2016/03/14 Werner Saar (wernsaar@googlemail.com) +* 2016/03/18 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK +* LAPACK-TEST : OK **************************************************************************************/ /*********************************************************************/ @@ -81,7 +82,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifdef __64BIT__ -#define STACKSIZE 320 +#define STACKSIZE 340 #define ALPHA_SP 296(SP) #define FZERO 304(SP) #else @@ -127,10 +128,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #define alpha_r vs30 -#define alpha_vr vs31 #define o0 0 +#define TBUFFER r14 #define o4 r15 #define o12 r16 #define o8 r17 @@ -202,6 +203,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. std r17, 256(SP) std r16, 264(SP) std r15, 272(SP) + std r14, 280(SP) #else stw r31, 144(SP) stw r30, 148(SP) @@ -220,6 +222,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stw r17, 200(SP) stw r16, 204(SP) stw r15, 208(SP) + stw r14, 212(SP) #endif // stfd f1, ALPHA_SP @@ -259,24 +262,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
cmpwi cr0, K, 0 ble .L999_H1 - li PRE, 384 + li PRE, 256 li o4 , 4 li o8 , 8 li o12, 12 li o16, 16 li o32, 32 li o48, 48 + addi TBUFFER, SP, 320 addi T1, SP, 300 stfs f1, 0(T1) - stfs f1, 4(T1) - stfs f1, 8(T1) - stfs f1,12(T1) - lxsspx vs28, 0, T1 - - xxspltw alpha_r, vs28 , 0 - lxvw4x alpha_vr, 0, T1 + lxsspx alpha_r, 0, T1 @@ -326,6 +324,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld r17, 256(SP) ld r16, 264(SP) ld r15, 272(SP) + ld r14, 280(SP) #else lwz r31, 144(SP) lwz r30, 148(SP) @@ -344,6 +343,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lwz r17, 200(SP) lwz r16, 204(SP) lwz r15, 208(SP) + lwz r14, 212(SP) #endif addi SP, SP, STACKSIZE diff --git a/kernel/power/sgemm_logic_16x8_power8.S b/kernel/power/sgemm_logic_16x8_power8.S index 6c5a1c7ef..0ae6413ce 100644 --- a/kernel/power/sgemm_logic_16x8_power8.S +++ b/kernel/power/sgemm_logic_16x8_power8.S @@ -26,13 +26,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** -* 2016/03/14 Werner Saar (wernsaar@googlemail.com) +* 2016/03/18 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK +* LAPACK-TEST : OK **************************************************************************************/ - srawi. J, N, 3 ble .LSGEMM_L8_END diff --git a/kernel/power/sgemm_macros_16x8_power8.S b/kernel/power/sgemm_macros_16x8_power8.S index 78f530cfa..a2d36c089 100644 --- a/kernel/power/sgemm_macros_16x8_power8.S +++ b/kernel/power/sgemm_macros_16x8_power8.S @@ -26,10 +26,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ /************************************************************************************** -* 2016/03/14 Werner Saar (wernsaar@googlemail.com) +* 2016/03/18 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK +* LAPACK-TEST : OK **************************************************************************************/ /********************************************************************************************** @@ -38,49 +39,65 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro LOAD8x16_1 - lxvw4x vs28, o0, BO - lxvw4x vs29, o16, BO - lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO lxvw4x vs2, o32, AO lxvw4x vs3, o48, AO + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + xxspltw vs8, vs28, 0 xxspltw vs9, vs28, 1 xxspltw vs10, vs28, 2 xxspltw vs11, vs28, 3 + lxvw4x vs29, o16, BO + xxspltw vs12, vs29, 0 xxspltw vs13, vs29, 1 - addi AO, AO, 64 - addi BO, BO, 32 xxspltw vs14, vs29, 2 xxspltw vs15, vs29, 3 + addi BO, BO, 32 .endm .macro KERNEL8x16_I1 - xvmulsp vs32, vs0, vs8 - xvmulsp vs33, vs1, vs8 lxvw4x vs4, o0, AO lxvw4x vs5, o16, AO + lxvw4x vs6, o32, AO + lxvw4x vs7, o48, AO - xvmulsp vs34, vs2, vs8 - xvmulsp vs35, vs3, vs8 + addi AO, AO, 64 lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + xxspltw vs18, vs28, 2 + xxspltw vs19, vs28, 3 + lxvw4x vs29, o16, BO - xvmulsp vs36, vs0, vs9 - xvmulsp vs37, vs1, vs9 + xxspltw vs20, vs29, 0 + xxspltw vs21, vs29, 1 + xxspltw vs22, vs29, 2 + xxspltw vs23, vs29, 3 + + addi BO, BO, 32 - lxvw4x vs6, o32, AO - lxvw4x vs7, o48, AO + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + xvmulsp vs34, vs2, vs8 + xvmulsp vs35, vs3, vs8 + + xvmulsp vs36, vs0, vs9 + xvmulsp vs37, vs1, vs9 xvmulsp vs38, vs2, vs9 xvmulsp vs39, vs3, vs9 @@ -104,27 +121,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmulsp vs54, vs2, vs13 xvmulsp vs55, vs3, vs13 - xxspltw vs16, vs28, 0 - xxspltw vs17, vs28, 1 - xxspltw vs18, vs28, 2 - xxspltw vs19, vs28, 3 - xvmulsp vs56, vs0, vs14 xvmulsp vs57, vs1, vs14 xvmulsp vs58, vs2, vs14 xvmulsp vs59, vs3, vs14 - xxspltw vs20, vs29, 0 - xxspltw vs21, vs29, 1 - xxspltw vs22, vs29, 2 - xxspltw vs23, vs29, 3 - xvmulsp vs60, vs0, vs15 xvmulsp vs61, vs1, vs15 - - addi AO, AO, 64 - addi BO, BO, 32 - xvmulsp vs62, vs2, vs15 xvmulsp vs63, vs3, vs15 @@ -135,36 +138,40 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs32, vs0, vs8 xvmaddasp vs33, vs1, vs8 - + lxvw4x vs28, o0, BO lxvw4x vs4, o0, AO - lxvw4x vs5, o16, AO - xvmaddasp vs34, vs2, vs8 xvmaddasp vs35, vs3, vs8 xvmaddasp vs36, vs0, vs9 xvmaddasp vs37, vs1, vs9 - lxvw4x vs28, o0, BO + lxvw4x vs29, o16, BO + lxvw4x vs5, o16, AO xvmaddasp vs38, vs2, vs9 xvmaddasp vs39, vs3, vs9 xvmaddasp vs40, vs0, vs10 xvmaddasp vs41, vs1, vs10 - lxvw4x vs6, o32, AO lxvw4x vs7, o48, AO - xvmaddasp vs42, vs2, vs10 xvmaddasp vs43, vs3, vs10 + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + xxspltw vs18, vs28, 2 + xxspltw vs19, vs28, 3 + xvmaddasp vs44, vs0, vs11 xvmaddasp vs45, vs1, vs11 - - lxvw4x vs29, o16, BO - xvmaddasp vs46, vs2, vs11 xvmaddasp vs47, vs3, vs11 + xxspltw vs20, vs29, 0 + xxspltw vs21, vs29, 1 + xxspltw vs22, vs29, 2 + xxspltw vs23, vs29, 3 + xvmaddasp vs48, vs0, vs12 xvmaddasp vs49, vs1, vs12 xvmaddasp vs50, vs2, vs12 @@ -172,36 +179,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmaddasp vs52, vs0, vs13 xvmaddasp vs53, vs1, vs13 - - xxspltw vs16, vs28, 0 - xxspltw vs17, vs28, 1 - xvmaddasp vs54, vs2, vs13 xvmaddasp vs55, vs3, vs13 xvmaddasp vs56, vs0, vs14 xvmaddasp vs57, vs1, vs14 - - xxspltw vs18, vs28, 2 - xxspltw vs19, vs28, 3 - + addi AO, AO, 64 + addi BO, BO, 32 xvmaddasp vs58, vs2, vs14 xvmaddasp vs59, vs3, vs14 - xxspltw vs20, vs29, 0 - xxspltw vs21, vs29, 1 - xvmaddasp vs60, vs0, vs15 xvmaddasp vs61, vs1, vs15 - - addi AO, AO, 64 - addi BO, BO, 32 - xvmaddasp vs62, vs2, vs15 xvmaddasp vs63, vs3, vs15 - xxspltw vs22, vs29, 2 - xxspltw vs23, vs29, 3 .endm @@ -210,8 +202,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs32, vs4, vs16 xvmaddasp vs33, vs5, vs16 + lxvw4x vs28, o0, BO lxvw4x vs0, o0, AO - lxvw4x vs1, o16, AO xvmaddasp vs34, vs6, vs16 xvmaddasp vs35, vs7, vs16 @@ -219,28 +211,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs36, vs4, vs17 xvmaddasp vs37, vs5, vs17 - lxvw4x vs28, o0, BO + lxvw4x vs29, o16, BO + lxvw4x vs1, o16, AO xvmaddasp vs38, vs6, vs17 xvmaddasp vs39, vs7, vs17 - xvmaddasp vs40, vs4, vs18 - xvmaddasp vs41, vs5, vs18 - lxvw4x vs2, o32, AO lxvw4x vs3, o48, AO + xvmaddasp vs40, vs4, vs18 + xvmaddasp vs41, vs5, vs18 xvmaddasp vs42, vs6, vs18 xvmaddasp vs43, vs7, vs18 + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + xvmaddasp vs44, vs4, vs19 xvmaddasp vs45, vs5, vs19 - - lxvw4x vs29, o16, BO - xvmaddasp vs46, vs6, vs19 xvmaddasp vs47, vs7, vs19 + xxspltw vs12, vs29, 0 + xxspltw vs13, vs29, 1 + xxspltw vs14, vs29, 2 + xxspltw vs15, vs29, 3 + xvmaddasp vs48, vs4, vs20 xvmaddasp vs49, vs5, vs20 xvmaddasp vs50, vs6, vs20 @@ -248,32 +247,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmaddasp vs52, vs4, vs21 xvmaddasp vs53, vs5, vs21 - - xxspltw vs8, vs28, 0 - xxspltw vs9, vs28, 1 - xxspltw vs10, vs28, 2 - xxspltw vs11, vs28, 3 - xvmaddasp vs54, vs6, vs21 xvmaddasp vs55, vs7, vs21 xvmaddasp vs56, vs4, vs22 xvmaddasp vs57, vs5, vs22 - - xxspltw vs12, vs29, 0 - xxspltw vs13, vs29, 1 - xxspltw vs14, vs29, 2 - xxspltw vs15, vs29, 3 - xvmaddasp vs58, vs6, vs22 xvmaddasp vs59, vs7, vs22 xvmaddasp vs60, vs4, vs23 xvmaddasp vs61, vs5, vs23 - addi AO, AO, 64 addi BO, BO, 32 - xvmaddasp vs62, vs6, vs23 xvmaddasp vs63, vs7, vs23 @@ -479,22 +464,106 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif + + stxvw4x vs32, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + #ifdef TRMMKERNEL + lxvw4x vs0, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs0, vs0, vs28 +#endif + + stxvw4x vs33, o0, TBUFFER - xvmulsp vs0, vs32, alpha_vr - xvmulsp vs1, vs33, alpha_vr - xvmulsp vs2, vs34, alpha_vr - xvmulsp vs3, vs35, alpha_vr + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs1, o0, TBUFFER #else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs1, vs1, vs28 +#endif + + stxvw4x vs34, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r - xvmaddasp vs0, vs32, alpha_vr - 
xvmaddasp vs1, vs33, alpha_vr - xvmaddasp vs2, vs34, alpha_vr - xvmaddasp vs3, vs35, alpha_vr + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER +#ifdef TRMMKERNEL + lxvw4x vs2, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs2, vs2, vs28 #endif + stxvw4x vs35, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs3, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs3, vs3, vs28 +#endif + + + + stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 stxvw4x vs2, o32, T1 @@ -512,22 +581,106 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif + + stxvw4x vs36, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + #ifdef TRMMKERNEL + lxvw4x vs0, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs0, vs0, vs28 +#endif + + stxvw4x vs37, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r - xvmulsp vs0, vs36, alpha_vr - xvmulsp vs1, vs37, alpha_vr - xvmulsp vs2, vs38, alpha_vr - xvmulsp vs3, vs39, alpha_vr + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER +#ifdef TRMMKERNEL + lxvw4x vs1, o0, TBUFFER #else + lxvw4x vs28, o0, TBUFFER + 
xvaddsp vs1, vs1, vs28 +#endif + + stxvw4x vs38, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER - xvmaddasp vs0, vs36, alpha_vr - xvmaddasp vs1, vs37, alpha_vr - xvmaddasp vs2, vs38, alpha_vr - xvmaddasp vs3, vs39, alpha_vr + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs2, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs2, vs2, vs28 #endif + stxvw4x vs39, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs3, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs3, vs3, vs28 +#endif + + + + stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 stxvw4x vs2, o32, T1 @@ -545,22 +698,106 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif + + stxvw4x vs40, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs0, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs0, vs0, vs28 +#endif + + stxvw4x vs41, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + #ifdef TRMMKERNEL + lxvw4x vs1, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs1, vs1, vs28 +#endif + + stxvw4x vs42, o0, TBUFFER - xvmulsp vs0, vs40, alpha_vr - xvmulsp vs1, vs41, alpha_vr - xvmulsp vs2, vs42, alpha_vr - xvmulsp vs3, vs43, alpha_vr + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs2, o0, TBUFFER #else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs2, vs2, vs28 +#endif + + stxvw4x vs43, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r - xvmaddasp vs0, vs40, alpha_vr - xvmaddasp vs1, vs41, alpha_vr - xvmaddasp vs2, vs42, alpha_vr - xvmaddasp vs3, vs43, alpha_vr + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, 
o8, TBUFFER + stxsspx vs7, o12, TBUFFER +#ifdef TRMMKERNEL + lxvw4x vs3, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs3, vs3, vs28 #endif + + + stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 stxvw4x vs2, o32, T1 @@ -578,22 +815,106 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif + + stxvw4x vs44, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs0, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs0, vs0, vs28 +#endif + + stxvw4x vs45, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + #ifdef TRMMKERNEL + lxvw4x vs1, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs1, vs1, vs28 +#endif - xvmulsp vs0, vs44, alpha_vr - xvmulsp vs1, vs45, alpha_vr - xvmulsp vs2, vs46, alpha_vr - xvmulsp vs3, vs47, alpha_vr + stxvw4x vs46, o0, TBUFFER + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs2, o0, TBUFFER #else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs2, vs2, vs28 +#endif + + stxvw4x vs47, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, 
TBUFFER - xvmaddasp vs0, vs44, alpha_vr - xvmaddasp vs1, vs45, alpha_vr - xvmaddasp vs2, vs46, alpha_vr - xvmaddasp vs3, vs47, alpha_vr + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs3, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs3, vs3, vs28 #endif + + + stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 stxvw4x vs2, o32, T1 @@ -611,22 +932,106 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif + + stxvw4x vs48, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + #ifdef TRMMKERNEL + lxvw4x vs0, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs0, vs0, vs28 +#endif + + stxvw4x vs49, o0, TBUFFER - xvmulsp vs0, vs48, alpha_vr - xvmulsp vs1, vs49, alpha_vr - xvmulsp vs2, vs50, alpha_vr - xvmulsp vs3, vs51, alpha_vr + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs1, o0, TBUFFER #else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs1, vs1, vs28 +#endif + + stxvw4x vs50, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r - xvmaddasp vs0, vs48, alpha_vr - 
xvmaddasp vs1, vs49, alpha_vr - xvmaddasp vs2, vs50, alpha_vr - xvmaddasp vs3, vs51, alpha_vr + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER +#ifdef TRMMKERNEL + lxvw4x vs2, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs2, vs2, vs28 #endif + stxvw4x vs51, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs3, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs3, vs3, vs28 +#endif + + + + stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 stxvw4x vs2, o32, T1 @@ -644,22 +1049,106 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif + + stxvw4x vs52, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs0, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs0, vs0, vs28 +#endif + + stxvw4x vs53, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + #ifdef TRMMKERNEL + lxvw4x vs1, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs1, vs1, vs28 +#endif + + stxvw4x vs54, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + 
lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r - xvmulsp vs0, vs52, alpha_vr - xvmulsp vs1, vs53, alpha_vr - xvmulsp vs2, vs54, alpha_vr - xvmulsp vs3, vs55, alpha_vr + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER +#ifdef TRMMKERNEL + lxvw4x vs2, o0, TBUFFER #else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs2, vs2, vs28 +#endif + + stxvw4x vs55, o0, TBUFFER - xvmaddasp vs0, vs52, alpha_vr - xvmaddasp vs1, vs53, alpha_vr - xvmaddasp vs2, vs54, alpha_vr - xvmaddasp vs3, vs55, alpha_vr + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs3, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs3, vs3, vs28 #endif + + + stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 stxvw4x vs2, o32, T1 @@ -677,22 +1166,106 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif + + stxvw4x vs56, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + #ifdef TRMMKERNEL + lxvw4x vs0, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs0, vs0, vs28 +#endif - xvmulsp vs0, vs56, alpha_vr - xvmulsp vs1, vs57, alpha_vr - xvmulsp vs2, vs58, alpha_vr - xvmulsp vs3, vs59, alpha_vr + stxvw4x vs57, o0, TBUFFER + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs1, o0, TBUFFER #else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs1, vs1, vs28 +#endif + + stxvw4x vs58, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER - xvmaddasp vs0, vs56, alpha_vr - xvmaddasp vs1, vs57, alpha_vr - xvmaddasp vs2, vs58, alpha_vr - xvmaddasp vs3, vs59, alpha_vr + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs2, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs2, vs2, vs28 #endif + stxvw4x vs59, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, 
TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs3, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs3, vs3, vs28 +#endif + + + + stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 stxvw4x vs2, o32, T1 @@ -710,22 +1283,106 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif + + stxvw4x vs60, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs0, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs0, vs0, vs28 +#endif + + stxvw4x vs61, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + #ifdef TRMMKERNEL + lxvw4x vs1, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs1, vs1, vs28 +#endif + + stxvw4x vs62, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER - xvmulsp vs0, vs60, alpha_vr - xvmulsp vs1, vs61, alpha_vr - xvmulsp vs2, vs62, alpha_vr - xvmulsp vs3, vs63, alpha_vr + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs2, o0, TBUFFER #else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs2, vs2, vs28 +#endif - xvmaddasp vs0, vs60, alpha_vr - xvmaddasp vs1, vs61, alpha_vr - xvmaddasp vs2, vs62, alpha_vr - xvmaddasp vs3, vs63, alpha_vr + 
stxvw4x vs63, o0, TBUFFER + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs3, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs3, vs3, vs28 #endif + + + stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 stxvw4x vs2, o32, T1 @@ -1068,17 +1725,187 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif + + stxvw4x vs32, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs0, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs0, vs0, vs28 +#endif + + stxvw4x vs33, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + #ifdef TRMMKERNEL + lxvw4x vs1, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs1, vs1, vs28 +#endif + + + + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL - xvmulsp vs0, vs32, alpha_vr - xvmulsp vs1, vs33, alpha_vr + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + + + stxvw4x vs34, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r 
+ xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs0, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs0, vs0, vs28 +#endif + + stxvw4x vs35, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs1, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs1, vs1, vs28 +#endif + + + + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + + + stxvw4x vs36, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs0, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs0, vs0, vs28 +#endif + + stxvw4x vs37, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs1, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs1, vs1, vs28 +#endif -#else - xvmaddasp vs0, vs32, alpha_vr - xvmaddasp vs1, vs33, alpha_vr -#endif stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 @@ -1093,42 
+1920,57 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -#ifdef TRMMKERNEL - - xvmulsp vs0, vs34, alpha_vr - xvmulsp vs1, vs35, alpha_vr -#else + stxvw4x vs38, o0, TBUFFER - xvmaddasp vs0, vs34, alpha_vr - xvmaddasp vs1, vs35, alpha_vr + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER -#endif + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r - stxvw4x vs0, o0, T1 - stxvw4x vs1, o16, T1 + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER - add T1, T1, LDC +#ifdef TRMMKERNEL + lxvw4x vs0, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs0, vs0, vs28 +#endif + stxvw4x vs39, o0, TBUFFER -#ifndef TRMMKERNEL + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER - lxvw4x vs0, o0, T1 - lxvw4x vs1, o16, T1 + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r -#endif + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER #ifdef TRMMKERNEL - - xvmulsp vs0, vs36, alpha_vr - xvmulsp vs1, vs37, alpha_vr - + lxvw4x vs1, o0, TBUFFER #else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs1, vs1, vs28 +#endif + - xvmaddasp vs0, vs36, alpha_vr - xvmaddasp vs1, vs37, alpha_vr -#endif stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 @@ -1143,42 +1985,57 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif -#ifdef TRMMKERNEL - - xvmulsp vs0, vs38, alpha_vr - xvmulsp vs1, vs39, alpha_vr -#else + stxvw4x vs40, o0, TBUFFER - xvmaddasp vs0, vs38, alpha_vr - xvmaddasp vs1, vs39, alpha_vr + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER -#endif + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r - stxvw4x vs0, o0, T1 - stxvw4x vs1, o16, T1 + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER - add T1, T1, LDC +#ifdef TRMMKERNEL + lxvw4x vs0, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs0, vs0, vs28 +#endif + stxvw4x vs41, o0, TBUFFER -#ifndef TRMMKERNEL + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER - lxvw4x vs0, o0, T1 - lxvw4x vs1, o16, T1 + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r -#endif + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER #ifdef TRMMKERNEL - - xvmulsp vs0, vs40, alpha_vr - xvmulsp vs1, vs41, alpha_vr - + lxvw4x vs1, o0, TBUFFER #else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs1, vs1, vs28 +#endif + - xvmaddasp vs0, vs40, alpha_vr - xvmaddasp vs1, vs41, alpha_vr -#endif stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 @@ -1193,18 +2050,58 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif -#ifdef TRMMKERNEL - xvmulsp vs0, vs42, alpha_vr - xvmulsp vs1, vs43, alpha_vr + stxvw4x vs42, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER +#ifdef TRMMKERNEL + lxvw4x vs0, o0, TBUFFER #else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs0, vs0, vs28 +#endif + + stxvw4x vs43, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER - xvmaddasp vs0, vs42, alpha_vr - xvmaddasp vs1, vs43, alpha_vr + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs1, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs1, vs1, vs28 #endif + + + stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 @@ -1218,18 +2115,58 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif -#ifdef TRMMKERNEL - xvmulsp vs0, vs44, alpha_vr - xvmulsp vs1, vs45, alpha_vr + stxvw4x vs44, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs0, o0, TBUFFER #else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs0, vs0, vs28 +#endif - xvmaddasp vs0, vs44, alpha_vr - xvmaddasp vs1, vs45, alpha_vr + stxvw4x vs45, o0, TBUFFER + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs1, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs1, vs1, vs28 #endif + + + stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 @@ -1243,18 +2180,58 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif -#ifdef TRMMKERNEL - xvmulsp vs0, vs46, alpha_vr - xvmulsp vs1, vs47, alpha_vr + stxvw4x vs46, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER +#ifdef TRMMKERNEL + lxvw4x vs0, o0, TBUFFER #else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs0, vs0, vs28 +#endif + + stxvw4x vs47, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER - xvmaddasp vs0, vs46, alpha_vr - xvmaddasp vs1, vs47, alpha_vr + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs1, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs1, vs1, vs28 #endif + + + stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 @@ -1540,16 +2517,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -#ifdef TRMMKERNEL - xvmulsp vs0, vs32, alpha_vr + stxvw4x vs32, o0, TBUFFER -#else + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER - xvmaddasp vs0, vs32, alpha_vr + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs0, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs0, vs0, vs28 #endif + + + stxvw4x vs0, o0, T1 add T1, T1, LDC @@ -1561,16 +2556,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif -#ifdef TRMMKERNEL - xvmulsp vs0, vs33, alpha_vr + stxvw4x vs33, o0, TBUFFER -#else + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER - xvmaddasp vs0, vs33, alpha_vr + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs0, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs0, vs0, vs28 #endif + + + stxvw4x vs0, o0, T1 add T1, T1, LDC @@ -1582,16 +2595,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -#ifdef TRMMKERNEL - xvmulsp vs0, vs34, alpha_vr + stxvw4x vs34, o0, TBUFFER -#else + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER - xvmaddasp vs0, vs34, alpha_vr + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs0, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs0, vs0, vs28 #endif + + + stxvw4x vs0, o0, T1 add T1, T1, LDC @@ -1603,16 +2634,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif -#ifdef TRMMKERNEL - xvmulsp vs0, vs35, alpha_vr + stxvw4x vs35, o0, TBUFFER -#else + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER - xvmaddasp vs0, vs35, alpha_vr + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs0, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs0, vs0, vs28 #endif + + + stxvw4x vs0, o0, T1 add T1, T1, LDC @@ -1624,16 +2673,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -#ifdef TRMMKERNEL - xvmulsp vs0, vs36, alpha_vr + stxvw4x vs36, o0, TBUFFER -#else + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER - xvmaddasp vs0, vs36, alpha_vr + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs0, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs0, vs0, vs28 #endif + + + stxvw4x vs0, o0, T1 add T1, T1, LDC @@ -1645,16 +2712,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif -#ifdef TRMMKERNEL - xvmulsp vs0, vs37, alpha_vr + stxvw4x vs37, o0, TBUFFER -#else + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER - xvmaddasp vs0, vs37, alpha_vr + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs0, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs0, vs0, vs28 #endif + + + stxvw4x vs0, o0, T1 add T1, T1, LDC @@ -1666,16 +2751,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -#ifdef TRMMKERNEL - xvmulsp vs0, vs38, alpha_vr + stxvw4x vs38, o0, TBUFFER -#else + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER - xvmaddasp vs0, vs38, alpha_vr + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs0, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs0, vs0, vs28 #endif + + + stxvw4x vs0, o0, T1 add T1, T1, LDC @@ -1687,16 +2790,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif -#ifdef TRMMKERNEL - xvmulsp vs0, vs39, alpha_vr + stxvw4x vs39, o0, TBUFFER -#else + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER - xvmaddasp vs0, vs39, alpha_vr + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs0, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs0, vs0, vs28 #endif + + + stxvw4x vs0, o0, T1 add T1, T1, LDC @@ -2043,8 +3164,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else - xsmaddasp vs0, vs32, alpha_r - xsmaddasp vs1, vs33, alpha_r + xsmulsp vs28, vs32, alpha_r + xsaddsp vs0, vs0, vs28 + xsmulsp vs28, vs33, alpha_r + xsaddsp vs1, vs1, vs28 #endif @@ -2068,8 +3191,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else - xsmaddasp vs0, vs34, alpha_r - xsmaddasp vs1, vs35, alpha_r + xsmulsp vs28, vs34, alpha_r + xsaddsp vs0, vs0, vs28 + xsmulsp vs28, vs35, alpha_r + xsaddsp vs1, vs1, vs28 #endif @@ -2093,8 +3218,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else - xsmaddasp vs0, vs36, alpha_r - xsmaddasp vs1, vs37, alpha_r + xsmulsp vs28, vs36, alpha_r + xsaddsp vs0, vs0, vs28 + xsmulsp vs28, vs37, alpha_r + xsaddsp vs1, vs1, vs28 #endif @@ -2118,8 +3245,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else - xsmaddasp vs0, vs38, alpha_r - xsmaddasp vs1, vs39, alpha_r + xsmulsp vs28, vs38, alpha_r + xsaddsp vs0, vs0, vs28 + xsmulsp vs28, vs39, alpha_r + xsaddsp vs1, vs1, vs28 #endif @@ -2143,8 +3272,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#else - xsmaddasp vs0, vs40, alpha_r - xsmaddasp vs1, vs41, alpha_r + xsmulsp vs28, vs40, alpha_r + xsaddsp vs0, vs0, vs28 + xsmulsp vs28, vs41, alpha_r + xsaddsp vs1, vs1, vs28 #endif @@ -2168,8 +3299,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else - xsmaddasp vs0, vs42, alpha_r - xsmaddasp vs1, vs43, alpha_r + xsmulsp vs28, vs42, alpha_r + xsaddsp vs0, vs0, vs28 + xsmulsp vs28, vs43, alpha_r + xsaddsp vs1, vs1, vs28 #endif @@ -2193,8 +3326,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else - xsmaddasp vs0, vs44, alpha_r - xsmaddasp vs1, vs45, alpha_r + xsmulsp vs28, vs44, alpha_r + xsaddsp vs0, vs0, vs28 + xsmulsp vs28, vs45, alpha_r + xsaddsp vs1, vs1, vs28 #endif @@ -2218,8 +3353,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else - xsmaddasp vs0, vs46, alpha_r - xsmaddasp vs1, vs47, alpha_r + xsmulsp vs28, vs46, alpha_r + xsaddsp vs0, vs0, vs28 + xsmulsp vs28, vs47, alpha_r + xsaddsp vs1, vs1, vs28 #endif @@ -2514,7 +3651,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else - xsmaddasp vs0, vs32, alpha_r + xsmulsp vs28, vs32, alpha_r + xsaddsp vs0, vs0, vs28 #endif @@ -2535,7 +3673,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else - xsmaddasp vs0, vs33, alpha_r + xsmulsp vs28, vs33, alpha_r + xsaddsp vs0, vs0, vs28 #endif @@ -2556,7 +3695,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else - xsmaddasp vs0, vs34, alpha_r + xsmulsp vs28, vs34, alpha_r + xsaddsp vs0, vs0, vs28 #endif @@ -2577,7 +3717,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else - xsmaddasp vs0, vs35, alpha_r + xsmulsp vs28, vs35, alpha_r + xsaddsp vs0, vs0, vs28 #endif @@ -2598,7 +3739,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#else - xsmaddasp vs0, vs36, alpha_r + xsmulsp vs28, vs36, alpha_r + xsaddsp vs0, vs0, vs28 #endif @@ -2619,7 +3761,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else - xsmaddasp vs0, vs37, alpha_r + xsmulsp vs28, vs37, alpha_r + xsaddsp vs0, vs0, vs28 #endif @@ -2640,7 +3783,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else - xsmaddasp vs0, vs38, alpha_r + xsmulsp vs28, vs38, alpha_r + xsaddsp vs0, vs0, vs28 #endif @@ -2661,7 +3805,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else - xsmaddasp vs0, vs39, alpha_r + xsmulsp vs28, vs39, alpha_r + xsaddsp vs0, vs0, vs28 #endif @@ -2952,22 +4097,106 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif + + stxvw4x vs32, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs0, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs0, vs0, vs28 +#endif + + stxvw4x vs33, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + #ifdef TRMMKERNEL + lxvw4x vs1, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs1, vs1, vs28 +#endif + + stxvw4x vs34, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r - xvmulsp 
vs0, vs32, alpha_vr - xvmulsp vs1, vs33, alpha_vr - xvmulsp vs2, vs34, alpha_vr - xvmulsp vs3, vs35, alpha_vr + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER +#ifdef TRMMKERNEL + lxvw4x vs2, o0, TBUFFER #else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs2, vs2, vs28 +#endif + + stxvw4x vs35, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER - xvmaddasp vs0, vs32, alpha_vr - xvmaddasp vs1, vs33, alpha_vr - xvmaddasp vs2, vs34, alpha_vr - xvmaddasp vs3, vs35, alpha_vr + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs3, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs3, vs3, vs28 #endif + + + stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 stxvw4x vs2, o32, T1 @@ -2985,22 +4214,106 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif + + stxvw4x vs36, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs0, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs0, vs0, vs28 +#endif + + stxvw4x vs37, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + #ifdef TRMMKERNEL + lxvw4x vs1, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs1, vs1, vs28 +#endif + + stxvw4x vs38, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r - xvmulsp vs0, vs36, alpha_vr - xvmulsp vs1, vs37, alpha_vr - xvmulsp vs2, vs38, alpha_vr - xvmulsp vs3, vs39, alpha_vr + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER +#ifdef TRMMKERNEL + lxvw4x vs2, o0, TBUFFER #else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs2, vs2, vs28 +#endif + + stxvw4x vs39, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER - xvmaddasp vs0, vs36, alpha_vr - xvmaddasp vs1, vs37, alpha_vr - xvmaddasp vs2, vs38, alpha_vr - xvmaddasp vs3, vs39, alpha_vr + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, 
TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs3, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs3, vs3, vs28 #endif + + + stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 stxvw4x vs2, o32, T1 @@ -3018,55 +4331,223 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif + + stxvw4x vs40, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs0, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs0, vs0, vs28 +#endif + + stxvw4x vs41, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + #ifdef TRMMKERNEL + lxvw4x vs1, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs1, vs1, vs28 +#endif + + stxvw4x vs42, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r - xvmulsp vs0, vs40, alpha_vr - xvmulsp vs1, vs41, alpha_vr - xvmulsp vs2, vs42, alpha_vr - xvmulsp vs3, vs43, alpha_vr + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER +#ifdef TRMMKERNEL + lxvw4x vs2, o0, TBUFFER #else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs2, vs2, vs28 +#endif + + stxvw4x vs43, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, 
TBUFFER - xvmaddasp vs0, vs40, alpha_vr - xvmaddasp vs1, vs41, alpha_vr - xvmaddasp vs2, vs42, alpha_vr - xvmaddasp vs3, vs43, alpha_vr + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs3, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs3, vs3, vs28 #endif + + + stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 stxvw4x vs2, o32, T1 stxvw4x vs3, o48, T1 - add T1, T1, LDC + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + + + stxvw4x vs44, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs0, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs0, vs0, vs28 +#endif + + stxvw4x vs45, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs1, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs1, vs1, vs28 +#endif + + stxvw4x vs46, o0, TBUFFER + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER -#ifndef TRMMKERNEL + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r - lxvw4x vs0, o0, T1 - lxvw4x vs1, o16, T1 - lxvw4x vs2, 
o32, T1 - lxvw4x vs3, o48, T1 + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER +#ifdef TRMMKERNEL + lxvw4x vs2, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs2, vs2, vs28 #endif -#ifdef TRMMKERNEL + stxvw4x vs47, o0, TBUFFER - xvmulsp vs0, vs44, alpha_vr - xvmulsp vs1, vs45, alpha_vr - xvmulsp vs2, vs46, alpha_vr - xvmulsp vs3, vs47, alpha_vr + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER -#else + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r - xvmaddasp vs0, vs44, alpha_vr - xvmaddasp vs1, vs45, alpha_vr - xvmaddasp vs2, vs46, alpha_vr - xvmaddasp vs3, vs47, alpha_vr + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER +#ifdef TRMMKERNEL + lxvw4x vs3, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs3, vs3, vs28 #endif + + + stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 stxvw4x vs2, o32, T1 @@ -3295,18 +4776,58 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif -#ifdef TRMMKERNEL - xvmulsp vs0, vs32, alpha_vr - xvmulsp vs1, vs33, alpha_vr + stxvw4x vs32, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs0, o0, TBUFFER #else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs0, vs0, vs28 +#endif - xvmaddasp vs0, vs32, alpha_vr - xvmaddasp vs1, vs33, alpha_vr + stxvw4x vs33, o0, TBUFFER + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs1, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs1, vs1, vs28 #endif + + + stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 @@ -3320,18 +4841,58 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif -#ifdef TRMMKERNEL - xvmulsp vs0, vs34, alpha_vr - xvmulsp vs1, vs35, alpha_vr + stxvw4x vs34, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER +#ifdef TRMMKERNEL + lxvw4x vs0, o0, TBUFFER #else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs0, vs0, vs28 +#endif + + stxvw4x vs35, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER - xvmaddasp vs0, vs34, alpha_vr - xvmaddasp vs1, vs35, alpha_vr + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs1, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs1, vs1, vs28 #endif + + + stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 @@ -3345,18 +4906,58 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif -#ifdef TRMMKERNEL - xvmulsp vs0, vs36, alpha_vr - xvmulsp vs1, vs37, alpha_vr + stxvw4x vs36, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs0, o0, TBUFFER #else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs0, vs0, vs28 +#endif - xvmaddasp vs0, vs36, alpha_vr - xvmaddasp vs1, vs37, alpha_vr + stxvw4x vs37, o0, TBUFFER + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs1, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs1, vs1, vs28 #endif + + + stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 @@ -3370,18 +4971,58 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif -#ifdef TRMMKERNEL - xvmulsp vs0, vs38, alpha_vr - xvmulsp vs1, vs39, alpha_vr + stxvw4x vs38, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER +#ifdef TRMMKERNEL + lxvw4x vs0, o0, TBUFFER #else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs0, vs0, vs28 +#endif + + stxvw4x vs39, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER - xvmaddasp vs0, vs38, alpha_vr - xvmaddasp vs1, vs39, alpha_vr + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs1, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs1, vs1, vs28 #endif + + + stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 @@ -3577,16 +5218,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -#ifdef TRMMKERNEL - xvmulsp vs0, vs32, alpha_vr + stxvw4x vs32, o0, TBUFFER -#else + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER - xvmaddasp vs0, vs32, alpha_vr + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs0, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs0, vs0, vs28 #endif + + + stxvw4x vs0, o0, T1 add T1, T1, LDC @@ -3598,16 +5257,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif -#ifdef TRMMKERNEL - xvmulsp vs0, vs33, alpha_vr + stxvw4x vs33, o0, TBUFFER -#else + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER - xvmaddasp vs0, vs33, alpha_vr + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs0, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs0, vs0, vs28 #endif + + + stxvw4x vs0, o0, T1 add T1, T1, LDC @@ -3619,16 +5296,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -#ifdef TRMMKERNEL - xvmulsp vs0, vs34, alpha_vr + stxvw4x vs34, o0, TBUFFER -#else + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER - xvmaddasp vs0, vs34, alpha_vr + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs0, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs0, vs0, vs28 #endif + + + stxvw4x vs0, o0, T1 add T1, T1, LDC @@ -3640,16 +5335,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif -#ifdef TRMMKERNEL - xvmulsp vs0, vs35, alpha_vr + stxvw4x vs35, o0, TBUFFER -#else + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER - xvmaddasp vs0, vs35, alpha_vr + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs0, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs0, vs0, vs28 #endif + + + stxvw4x vs0, o0, T1 add T1, T1, LDC @@ -3882,8 +5595,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else - xsmaddasp vs0, vs32, alpha_r - xsmaddasp vs1, vs33, alpha_r + xsmulsp vs28, vs32, alpha_r + xsaddsp vs0, vs0, vs28 + xsmulsp vs28, vs33, alpha_r + xsaddsp vs1, vs1, vs28 #endif @@ -3907,8 +5622,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else - xsmaddasp vs0, vs34, alpha_r - xsmaddasp vs1, vs35, alpha_r + xsmulsp vs28, vs34, alpha_r + xsaddsp vs0, vs0, vs28 + xsmulsp vs28, vs35, alpha_r + xsaddsp vs1, vs1, vs28 #endif @@ -3932,8 +5649,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else - xsmaddasp vs0, vs36, alpha_r - xsmaddasp vs1, vs37, alpha_r + xsmulsp vs28, vs36, alpha_r + xsaddsp vs0, vs0, vs28 + xsmulsp vs28, vs37, alpha_r + xsaddsp vs1, vs1, vs28 #endif @@ -3957,8 +5676,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else - xsmaddasp vs0, vs38, alpha_r - xsmaddasp vs1, vs39, alpha_r + xsmulsp vs28, vs38, alpha_r + xsaddsp vs0, vs0, vs28 + xsmulsp vs28, vs39, alpha_r + xsaddsp vs1, vs1, vs28 #endif @@ -4163,7 +5884,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#else - xsmaddasp vs0, vs32, alpha_r + xsmulsp vs28, vs32, alpha_r + xsaddsp vs0, vs0, vs28 #endif @@ -4184,7 +5906,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else - xsmaddasp vs0, vs33, alpha_r + xsmulsp vs28, vs33, alpha_r + xsaddsp vs0, vs0, vs28 #endif @@ -4205,7 +5928,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else - xsmaddasp vs0, vs34, alpha_r + xsmulsp vs28, vs34, alpha_r + xsaddsp vs0, vs0, vs28 #endif @@ -4226,7 +5950,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else - xsmaddasp vs0, vs35, alpha_r + xsmulsp vs28, vs35, alpha_r + xsaddsp vs0, vs0, vs28 #endif @@ -4445,22 +6170,106 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif + + stxvw4x vs32, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs0, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs0, vs0, vs28 +#endif + + stxvw4x vs33, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + #ifdef TRMMKERNEL + lxvw4x vs1, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs1, vs1, vs28 +#endif + + stxvw4x vs34, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r - xvmulsp 
vs0, vs32, alpha_vr - xvmulsp vs1, vs33, alpha_vr - xvmulsp vs2, vs34, alpha_vr - xvmulsp vs3, vs35, alpha_vr + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER +#ifdef TRMMKERNEL + lxvw4x vs2, o0, TBUFFER #else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs2, vs2, vs28 +#endif + + stxvw4x vs35, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER - xvmaddasp vs0, vs32, alpha_vr - xvmaddasp vs1, vs33, alpha_vr - xvmaddasp vs2, vs34, alpha_vr - xvmaddasp vs3, vs35, alpha_vr + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs3, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs3, vs3, vs28 #endif + + + stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 stxvw4x vs2, o32, T1 @@ -4478,22 +6287,106 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif + + stxvw4x vs36, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs0, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs0, vs0, vs28 +#endif + + stxvw4x vs37, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + #ifdef TRMMKERNEL + lxvw4x vs1, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs1, vs1, vs28 +#endif + + stxvw4x vs38, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r - xvmulsp vs0, vs36, alpha_vr - xvmulsp vs1, vs37, alpha_vr - xvmulsp vs2, vs38, alpha_vr - xvmulsp vs3, vs39, alpha_vr + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER +#ifdef TRMMKERNEL + lxvw4x vs2, o0, TBUFFER #else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs2, vs2, vs28 +#endif + + stxvw4x vs39, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER - xvmaddasp vs0, vs36, alpha_vr - xvmaddasp vs1, vs37, alpha_vr - xvmaddasp vs2, vs38, alpha_vr - xvmaddasp vs3, vs39, alpha_vr + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, 
TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs3, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs3, vs3, vs28 #endif + + + stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 stxvw4x vs2, o32, T1 @@ -4674,18 +6567,58 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -#ifdef TRMMKERNEL - xvmulsp vs0, vs32, alpha_vr - xvmulsp vs1, vs33, alpha_vr + stxvw4x vs32, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs0, o0, TBUFFER #else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs0, vs0, vs28 +#endif - xvmaddasp vs0, vs32, alpha_vr - xvmaddasp vs1, vs33, alpha_vr + stxvw4x vs33, o0, TBUFFER + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs1, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs1, vs1, vs28 #endif + + + stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 @@ -4699,18 +6632,58 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif -#ifdef TRMMKERNEL - xvmulsp vs0, vs34, alpha_vr - xvmulsp vs1, vs35, alpha_vr + stxvw4x vs34, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER +#ifdef TRMMKERNEL + lxvw4x vs0, o0, TBUFFER #else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs0, vs0, vs28 +#endif + + stxvw4x vs35, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER - xvmaddasp vs0, vs34, alpha_vr - xvmaddasp vs1, vs35, alpha_vr + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs1, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs1, vs1, vs28 #endif + + + stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 @@ -4870,16 +6843,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -#ifdef TRMMKERNEL - xvmulsp vs0, vs32, alpha_vr + stxvw4x vs32, o0, TBUFFER -#else + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER - xvmaddasp vs0, vs32, alpha_vr + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs0, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs0, vs0, vs28 #endif + + + stxvw4x vs0, o0, T1 add T1, T1, LDC @@ -4891,16 +6882,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif -#ifdef TRMMKERNEL - xvmulsp vs0, vs33, alpha_vr + stxvw4x vs33, o0, TBUFFER -#else + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER - xvmaddasp vs0, vs33, alpha_vr + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs0, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs0, vs0, vs28 #endif + + + stxvw4x vs0, o0, T1 add T1, T1, LDC @@ -5085,8 +7094,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else - xsmaddasp vs0, vs32, alpha_r - xsmaddasp vs1, vs33, alpha_r + xsmulsp vs28, vs32, alpha_r + xsaddsp vs0, vs0, vs28 + xsmulsp vs28, vs33, alpha_r + xsaddsp vs1, vs1, vs28 #endif @@ -5110,8 +7121,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else - xsmaddasp vs0, vs34, alpha_r - xsmaddasp vs1, vs35, alpha_r + xsmulsp vs28, vs34, alpha_r + xsaddsp vs0, vs0, vs28 + xsmulsp vs28, vs35, alpha_r + xsaddsp vs1, vs1, vs28 #endif @@ -5280,7 +7293,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else - xsmaddasp vs0, vs32, alpha_r + xsmulsp vs28, vs32, alpha_r + xsaddsp vs0, vs0, vs28 #endif @@ -5301,7 +7315,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else - xsmaddasp vs0, vs33, alpha_r + xsmulsp vs28, vs33, alpha_r + xsaddsp vs0, vs0, vs28 #endif @@ -5484,22 +7499,106 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif + + stxvw4x vs32, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs0, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs0, vs0, vs28 +#endif + + stxvw4x vs33, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + #ifdef TRMMKERNEL + lxvw4x vs1, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs1, vs1, vs28 +#endif + + stxvw4x vs34, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r - xvmulsp vs0, vs32, alpha_vr - xvmulsp vs1, vs33, alpha_vr - xvmulsp vs2, vs34, alpha_vr - xvmulsp vs3, vs35, alpha_vr + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER +#ifdef TRMMKERNEL + lxvw4x vs2, o0, TBUFFER #else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs2, vs2, vs28 +#endif + + stxvw4x vs35, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER - xvmaddasp vs0, vs32, alpha_vr - xvmaddasp vs1, vs33, alpha_vr - xvmaddasp vs2, vs34, alpha_vr - xvmaddasp vs3, vs35, alpha_vr + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, 
TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs3, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs3, vs3, vs28 #endif + + + stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 stxvw4x vs2, o32, T1 @@ -5656,18 +7755,58 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -#ifdef TRMMKERNEL - xvmulsp vs0, vs32, alpha_vr - xvmulsp vs1, vs33, alpha_vr + stxvw4x vs32, o0, TBUFFER + + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs0, o0, TBUFFER #else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs0, vs0, vs28 +#endif - xvmaddasp vs0, vs32, alpha_vr - xvmaddasp vs1, vs33, alpha_vr + stxvw4x vs33, o0, TBUFFER + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER + + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs1, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs1, vs1, vs28 #endif + + + stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 @@ -5809,16 +7948,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif -#ifdef TRMMKERNEL - xvmulsp vs0, vs32, alpha_vr + stxvw4x vs32, o0, TBUFFER -#else + lxsspx vs4, o0, TBUFFER + lxsspx vs5, o4, TBUFFER + lxsspx vs6, o8, TBUFFER + lxsspx vs7, o12, TBUFFER - xvmaddasp vs0, vs32, alpha_vr + xsmulsp vs4, vs4, alpha_r + xsmulsp vs5, vs5, alpha_r + xsmulsp vs6, vs6, alpha_r + xsmulsp vs7, vs7, alpha_r + stxsspx vs4, o0, TBUFFER + stxsspx vs5, o4, TBUFFER + stxsspx vs6, o8, TBUFFER + stxsspx vs7, o12, TBUFFER + +#ifdef TRMMKERNEL + lxvw4x vs0, o0, TBUFFER +#else + lxvw4x vs28, o0, TBUFFER + xvaddsp vs0, vs0, vs28 #endif + + + stxvw4x vs0, o0, T1 add T1, T1, LDC @@ -5979,8 +8136,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else - xsmaddasp vs0, vs32, alpha_r - xsmaddasp vs1, vs33, alpha_r + xsmulsp vs28, vs32, alpha_r + xsaddsp vs0, vs0, vs28 + xsmulsp vs28, vs33, alpha_r + xsaddsp vs1, vs1, vs28 #endif @@ -6131,7 +8290,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else - xsmaddasp vs0, vs32, alpha_r + xsmulsp vs28, vs32, alpha_r + xsaddsp vs0, vs0, vs28 #endif diff --git a/kernel/power/strmm_kernel_16x8_power8.S b/kernel/power/strmm_kernel_16x8_power8.S index 5b1c5ca6b..5e607c58f 100644 --- a/kernel/power/strmm_kernel_16x8_power8.S +++ b/kernel/power/strmm_kernel_16x8_power8.S @@ -26,10 +26,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** -* 2016/03/14 Werner Saar (wernsaar@googlemail.com) +* 2016/03/18 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK +* LAPACK-TEST : OK **************************************************************************************/ /*********************************************************************/ @@ -81,7 +82,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif #ifdef __64BIT__ -#define STACKSIZE 320 +#define STACKSIZE 340 #define ALPHA_SP 296(SP) #define FZERO 304(SP) #else @@ -127,10 +128,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #define alpha_r vs30 -#define alpha_vr vs31 #define o0 0 +#define TBUFFER r13 #define o12 r14 #define o4 r15 #define K1 r16 @@ -138,7 +139,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define L r18 #define T1 r19 #define KK r20 -#define KKK 21 +#define KKK r21 #define I r22 #define J r23 #define AO r24 @@ -204,6 +205,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. std r16, 264(SP) std r15, 272(SP) std r14, 280(SP) + std r13, 288(SP) #else stw r31, 144(SP) stw r30, 148(SP) @@ -223,6 +225,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stw r16, 204(SP) stw r15, 208(SP) stw r14, 212(SP) + stw r13, 216(SP) #endif // stfd f1, ALPHA_SP @@ -274,17 +277,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. li o16, 16 li o32, 32 li o48, 48 + addi TBUFFER, SP, 320 addi T1, SP, 300 stfs f1, 0(T1) - stfs f1, 4(T1) - stfs f1, 8(T1) - stfs f1,12(T1) - lxsspx vs28, 0, T1 + lxsspx alpha_r, 0, T1 - xxspltw alpha_r, vs28 , 0 - lxvw4x alpha_vr, 0, T1 @@ -335,6 +334,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld r16, 264(SP) ld r15, 272(SP) ld r14, 280(SP) + ld r13, 288(SP) #else lwz r31, 144(SP) lwz r30, 148(SP) @@ -354,6 +354,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lwz r16, 204(SP) lwz r15, 208(SP) lwz r14, 212(SP) + lwz r13, 216(SP) #endif addi SP, SP, STACKSIZE diff --git a/kernel/power/strmm_logic_16x8_power8.S b/kernel/power/strmm_logic_16x8_power8.S index 0d6d04858..8ec11f1ef 100644 --- a/kernel/power/strmm_logic_16x8_power8.S +++ b/kernel/power/strmm_logic_16x8_power8.S @@ -26,14 +26,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ /************************************************************************************** -* 2016/03/14 Werner Saar (wernsaar@googlemail.com) +* 2016/03/18 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK +* LAPACK-TEST : OK **************************************************************************************/ - srawi. J, N, 3 ble .LSTRMM_L8_END diff --git a/param.h b/param.h index 980650e09..370d10b9a 100644 --- a/param.h +++ b/param.h @@ -1977,12 +1977,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ZGEMM_DEFAULT_UNROLL_M 8 #define ZGEMM_DEFAULT_UNROLL_N 2 -#define SGEMM_DEFAULT_P 960 +#define SGEMM_DEFAULT_P 480 #define DGEMM_DEFAULT_P 480 #define CGEMM_DEFAULT_P 480 #define ZGEMM_DEFAULT_P 240 -#define SGEMM_DEFAULT_Q 720 +#define SGEMM_DEFAULT_Q 1440 #define DGEMM_DEFAULT_Q 720 #define CGEMM_DEFAULT_Q 720 #define ZGEMM_DEFAULT_Q 360 From 84b92e6373193e8c33348f61dd33358110307c6a Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Sun, 20 Mar 2016 11:06:06 +0100 Subject: [PATCH 06/48] added optimized ddot kernel for POWER8 --- kernel/power/KERNEL.POWER8 | 2 +- kernel/power/ddot.c | 132 +++++++++++++++++++++++ kernel/power/ddot_microk_power8.c | 170 ++++++++++++++++++++++++++++++ 3 files changed, 303 insertions(+), 1 deletion(-) create mode 100644 kernel/power/ddot.c create mode 100644 kernel/power/ddot_microk_power8.c diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index eaa9f26ed..1e0c6a595 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -121,7 +121,7 @@ ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c #ZCOPYKERNEL = ../arm/zcopy.c # #SDOTKERNEL = ../arm/dot.c -#DDOTKERNEL = ../arm/dot.c +DDOTKERNEL = ddot.c #CDOTKERNEL = ../arm/zdot.c #ZDOTKERNEL = ../arm/zdot.c # diff --git a/kernel/power/ddot.c b/kernel/power/ddot.c new file mode 100644 index 000000000..488d2a7b3 --- 
/dev/null +++ b/kernel/power/ddot.c @@ -0,0 +1,132 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + + +#include "common.h" + + +#if defined(POWER8) +#include "ddot_microk_power8.c" +#endif + + +#ifndef HAVE_KERNEL_8 + +static void ddot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) +{ + BLASLONG register i = 0; + FLOAT dot = 0.0; + + while(i < n) + { + dot += y[i] * x[i] + + y[i+1] * x[i+1] + + y[i+2] * x[i+2] + + y[i+3] * x[i+3] + + y[i+4] * x[i+4] + + y[i+5] * x[i+5] + + y[i+6] * x[i+6] + + y[i+7] * x[i+7] ; + + i+=8 ; + + } + *d += dot; + +} + +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + + FLOAT dot = 0.0 ; + + if ( n <= 0 ) return(dot); + + if ( (inc_x == 1) && (inc_y == 1) ) + { + + BLASLONG n1 = n & -16; + + if ( n1 ) + ddot_kernel_8(n1, x, y , &dot ); + + i = n1; + while(i < n) + { + + dot += y[i] * x[i] ; + i++ ; + + } + return(dot); + + + } + + FLOAT temp1 = 0.0; + FLOAT temp2 = 0.0; + + BLASLONG n1 = n & -4; + + while(i < n1) + { + + FLOAT m1 = y[iy] * x[ix] ; + FLOAT m2 = y[iy+inc_y] * x[ix+inc_x] ; + + FLOAT m3 = y[iy+2*inc_y] * x[ix+2*inc_x] ; + FLOAT m4 = y[iy+3*inc_y] * x[ix+3*inc_x] ; + + ix += inc_x*4 ; + iy += inc_y*4 ; + + temp1 += m1+m3; + temp2 += m2+m4; + + i+=4 ; + + } + + while(i < n) + { + + temp1 += y[iy] * x[ix] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + dot = temp1 + temp2; + return(dot); + +} + + diff --git a/kernel/power/ddot_microk_power8.c b/kernel/power/ddot_microk_power8.c new file mode 100644 index 000000000..085c163d4 --- /dev/null +++ b/kernel/power/ddot_microk_power8.c @@ -0,0 +1,170 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define HAVE_KERNEL_8 1 +static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline)); + +static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) +{ + + + BLASLONG i = n; + BLASLONG o16 = 16; + BLASLONG o32 = 32; + BLASLONG o48 = 48; + BLASLONG o64 = 64; + BLASLONG o80 = 80; + BLASLONG o96 = 96; + BLASLONG o112 = 112; + FLOAT *x1=x; + FLOAT *y1=y; + BLASLONG pre = 384; + + __asm__ __volatile__ + ( + "xxlxor 32,32,32 \n\t" + "xxlxor 33,33,33 \n\t" + "xxlxor 34,34,34 \n\t" + "xxlxor 35,35,35 \n\t" + "xxlxor 36,36,36 \n\t" + "xxlxor 37,37,37 \n\t" + "xxlxor 38,38,38 \n\t" + "xxlxor 39,39,39 \n\t" + + "dcbt %2, %12 \n\t" + "dcbt %3, %12 \n\t" + + "lxvd2x 40, 0, %2 \n\t" + "lxvd2x 48, 0, %3 \n\t" + "lxvd2x 41, %5, %2 \n\t" + "lxvd2x 49, %5, %3 \n\t" + "lxvd2x 42, %6, %2 \n\t" + "lxvd2x 50, %6, %3 \n\t" + "lxvd2x 43, %7, %2 \n\t" + "lxvd2x 51, %7, %3 \n\t" + "lxvd2x 44, %8, %2 \n\t" + "lxvd2x 52, %8, %3 \n\t" + "lxvd2x 45, %9, %2 \n\t" + "lxvd2x 53, %9, %3 \n\t" + "lxvd2x 46, %10, %2 \n\t" + "lxvd2x 54, %10, %3 \n\t" + "lxvd2x 47, %11, %2 \n\t" + "lxvd2x 55, %11, %3 \n\t" + + "addi %2, %2, 128 \n\t" + "addi %3, %3, 128 \n\t" + + "addic. 
%0 , %0 , -16 \n\t" + "ble 2f \n\t" + + ".align 5 \n\t" + "1: \n\t" + + "dcbt %2, %12 \n\t" + "dcbt %3, %12 \n\t" + + "xvmaddadp 32, 40, 48 \n\t" + "lxvd2x 40, 0, %2 \n\t" + "lxvd2x 48, 0, %3 \n\t" + "xvmaddadp 33, 41, 49 \n\t" + "lxvd2x 41, %5, %2 \n\t" + "lxvd2x 49, %5, %3 \n\t" + "xvmaddadp 34, 42, 50 \n\t" + "lxvd2x 42, %6, %2 \n\t" + "lxvd2x 50, %6, %3 \n\t" + "xvmaddadp 35, 43, 51 \n\t" + "lxvd2x 43, %7, %2 \n\t" + "lxvd2x 51, %7, %3 \n\t" + "xvmaddadp 36, 44, 52 \n\t" + "lxvd2x 44, %8, %2 \n\t" + "lxvd2x 52, %8, %3 \n\t" + "xvmaddadp 37, 45, 53 \n\t" + "lxvd2x 45, %9, %2 \n\t" + "lxvd2x 53, %9, %3 \n\t" + "xvmaddadp 38, 46, 54 \n\t" + "lxvd2x 46, %10, %2 \n\t" + "lxvd2x 54, %10, %3 \n\t" + "xvmaddadp 39, 47, 55 \n\t" + + "lxvd2x 47, %11, %2 \n\t" + "lxvd2x 55, %11, %3 \n\t" + + + "addi %2, %2, 128 \n\t" + "addi %3, %3, 128 \n\t" + + "addic. %0 , %0 , -16 \n\t" + "bgt 1b \n\t" + + "2: \n\t" + + "xvmaddadp 32, 40, 48 \n\t" + "xvmaddadp 33, 41, 49 \n\t" + "xvmaddadp 34, 42, 50 \n\t" + "xvmaddadp 35, 43, 51 \n\t" + "xvmaddadp 36, 44, 52 \n\t" + "xvmaddadp 37, 45, 53 \n\t" + "xvmaddadp 38, 46, 54 \n\t" + "xvmaddadp 39, 47, 55 \n\t" + + "xvadddp 32, 32, 33 \n\t" + "xvadddp 34, 34, 35 \n\t" + "xvadddp 36, 36, 37 \n\t" + "xvadddp 38, 38, 39 \n\t" + + "xvadddp 32, 32, 34 \n\t" + "xvadddp 36, 36, 38 \n\t" + + "xvadddp 32, 32, 36 \n\t" + + "xxswapd 33, 32 \n\t" + + "xsadddp 32, 32, 33 \n\t" + + "stxsdx 32, 0, %4 \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x1), // 2 + "r" (y1), // 3 + "r" (dot), // 4 + "r" (o16), // 5 + "r" (o32), // 6 + "r" (o48), // 7 + "r" (o64), // 8 + "r" (o80), // 9 + "r" (o96), // 10 + "r" (o112), // 11 + "r" (pre) // 12 + : "cr0", "%0", "%2" , "%3", "memory" + ); + +} + + From cd9fafc054e767433f15a036be69cab0eae56a40 Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Sun, 20 Mar 2016 11:19:27 +0100 Subject: [PATCH 07/48] ddot for POWER8: updated licence information --- kernel/power/ddot.c | 9 ++++++++- kernel/power/ddot_microk_power8.c 
| 10 +++++++++- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/kernel/power/ddot.c b/kernel/power/ddot.c index 488d2a7b3..cef60a2e5 100644 --- a/kernel/power/ddot.c +++ b/kernel/power/ddot.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2014, The OpenBLAS Project +Copyright (c) 2013-2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -25,6 +25,13 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ +/************************************************************************************** +* 2016/03/20 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ #include "common.h" diff --git a/kernel/power/ddot_microk_power8.c b/kernel/power/ddot_microk_power8.c index 085c163d4..b88049212 100644 --- a/kernel/power/ddot_microk_power8.c +++ b/kernel/power/ddot_microk_power8.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2014, The OpenBLAS Project +Copyright (c) 2013-2016, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -25,6 +25,14 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ +/************************************************************************************** +* 2016/03/20 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + #define HAVE_KERNEL_8 1 static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline)); From fc3a558515c024bcffd5e65a6f2d72ae3e6a3eb9 Mon Sep 17 00:00:00 2001 From: theoractice Date: Sun, 20 Mar 2016 18:58:18 +0800 Subject: [PATCH 08/48] Fix a minor compiler error in VisualStudio with CMake --- common_x86.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common_x86.h b/common_x86.h index ab9f22b0d..4363fb2f4 100644 --- a/common_x86.h +++ b/common_x86.h @@ -62,7 +62,7 @@ static void __inline blas_lock(volatile BLASULONG *address){ #if defined(_MSC_VER) && !defined(__clang__) // use intrinsic instead of inline assembly - ret = _InterlockedExchange(address, 1); + ret = _InterlockedExchange((volatile LONG *)address, 1); // inline assembly /*__asm { mov eax, address From 9e4584d069891c136194bfa02f796deb4f990a08 Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Mon, 21 Mar 2016 10:12:07 +0100 Subject: [PATCH 09/48] added optimized zdot kernel for POWER8 --- kernel/power/KERNEL.POWER8 | 2 +- kernel/power/zdot.c | 167 +++++++++++++++++++++++ kernel/power/zdot_microk_power8.c | 219 ++++++++++++++++++++++++++++++ 3 files changed, 387 insertions(+), 1 deletion(-) create mode 100644 kernel/power/zdot.c create mode 100644 kernel/power/zdot_microk_power8.c diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index 1e0c6a595..b8a854960 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -123,7 +123,7 @@ ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c #SDOTKERNEL = ../arm/dot.c DDOTKERNEL = ddot.c #CDOTKERNEL = ../arm/zdot.c 
-#ZDOTKERNEL = ../arm/zdot.c +ZDOTKERNEL = zdot.c # #SNRM2KERNEL = ../arm/nrm2.c #DNRM2KERNEL = ../arm/nrm2.c diff --git a/kernel/power/zdot.c b/kernel/power/zdot.c new file mode 100644 index 000000000..1205b34b6 --- /dev/null +++ b/kernel/power/zdot.c @@ -0,0 +1,167 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2016/03/21 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#include "common.h" +#include + + +#if defined(POWER8) +#include "zdot_microk_power8.c" +#endif + + +#ifndef HAVE_KERNEL_8 + +static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) __attribute__ ((noinline)); + +static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) +{ + BLASLONG register i = 0; + FLOAT dot[4] = { 0.0, 0.0, 0.0, 0.0 }; + BLASLONG j=0; + + while( i < n ) + { + + dot[0] += x[j] * y[j] ; + dot[1] += x[j+1] * y[j+1] ; + dot[2] += x[j] * y[j+1] ; + dot[3] += x[j+1] * y[j] ; + + dot[0] += x[j+2] * y[j+2] ; + dot[1] += x[j+3] * y[j+3] ; + dot[2] += x[j+2] * y[j+3] ; + dot[3] += x[j+3] * y[j+2] ; + + dot[0] += x[j+4] * y[j+4] ; + dot[1] += x[j+5] * y[j+5] ; + dot[2] += x[j+4] * y[j+5] ; + dot[3] += x[j+5] * y[j+4] ; + + dot[0] += x[j+6] * y[j+6] ; + dot[1] += x[j+7] * y[j+7] ; + dot[2] += x[j+6] * y[j+7] ; + dot[3] += x[j+7] * y[j+6] ; + + j+=8; + i+=4; + + } + d[0] = dot[0]; + d[1] = dot[1]; + d[2] = dot[2]; + d[3] = dot[3]; + +} + +#endif + +FLOAT _Complex CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ + BLASLONG i; + BLASLONG ix,iy; + FLOAT _Complex result; + FLOAT dot[4] = { 0.0, 0.0, 0.0 , 0.0 } ; + + if ( n <= 0 ) + { + __real__ result = 0.0 ; + __imag__ result = 0.0 ; + return(result); + + } + + if ( (inc_x == 1) && (inc_y == 1) ) + { + + BLASLONG n1 = n & -8; + + if ( n1 ) + zdot_kernel_8(n1, x, y , dot ); + + i = n1; + BLASLONG j = i * 2; + + while( i < n ) + { + + dot[0] += x[j] * y[j] ; + dot[1] += x[j+1] * y[j+1] ; + dot[2] += x[j] * y[j+1] ; + dot[3] += x[j+1] * y[j] ; + + j+=2; + i++ ; + + } + + + } + else + { + i=0; + 
ix=0; + iy=0; + inc_x <<= 1; + inc_y <<= 1; + while(i < n) + { + + dot[0] += x[ix] * y[iy] ; + dot[1] += x[ix+1] * y[iy+1] ; + dot[2] += x[ix] * y[iy+1] ; + dot[3] += x[ix+1] * y[iy] ; + + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + } + +#if !defined(CONJ) + __real__ result = dot[0] - dot[1]; + __imag__ result = dot[2] + dot[3]; +#else + __real__ result = dot[0] + dot[1]; + __imag__ result = dot[2] - dot[3]; + +#endif + + return(result); + +} + + diff --git a/kernel/power/zdot_microk_power8.c b/kernel/power/zdot_microk_power8.c new file mode 100644 index 000000000..296d3d469 --- /dev/null +++ b/kernel/power/zdot_microk_power8.c @@ -0,0 +1,219 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/03/21 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#define HAVE_KERNEL_8 1 +static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline)); + +static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) +{ + + + BLASLONG i = n; + BLASLONG o16 = 16; + BLASLONG o32 = 32; + BLASLONG o48 = 48; + FLOAT *x1=x; + FLOAT *y1=y; + BLASLONG pre = 384; + + __asm__ __volatile__ + ( + "xxlxor 32,32,32 \n\t" + "xxlxor 33,33,33 \n\t" + "xxlxor 34,34,34 \n\t" + "xxlxor 35,35,35 \n\t" + "xxlxor 36,36,36 \n\t" + "xxlxor 37,37,37 \n\t" + "xxlxor 38,38,38 \n\t" + "xxlxor 39,39,39 \n\t" + + "dcbt %2, %8 \n\t" + "dcbt %3, %8 \n\t" + + "lxvd2x 40, 0, %2 \n\t" // x0_r, x0_i + "lxvd2x 48, 0, %3 \n\t" // y0_r, y0_i + "lxvd2x 41, %5, %2 \n\t" // x1_r, x1_i + "lxvd2x 49, %5, %3 \n\t" // y1_r, y1_i + "lxvd2x 42, %6, %2 \n\t" // x2_r, x2_i + "lxvd2x 50, %6, %3 \n\t" // y2_r, y2_i + "lxvd2x 43, %7, %2 \n\t" // x3_r, x3_i + "lxvd2x 51, %7, %3 \n\t" // y3_r, y3_i + + "xxswapd 52,48 \n\t" // y0_i, y0_r + "xxswapd 53,49 \n\t" // y1_i, y1_r + "xxswapd 54,50 \n\t" // y2_i, y2_r + "xxswapd 55,51 \n\t" // y3_i, y3_r + + "addi 
%2, %2, 64 \n\t" + "addi %3, %3, 64 \n\t" + + + "lxvd2x 44, 0, %2 \n\t" // x0_r, x0_i + "lxvd2x 56, 0, %3 \n\t" // y0_r, y0_i + "lxvd2x 45, %5, %2 \n\t" // x1_r, x1_i + "lxvd2x 57, %5, %3 \n\t" // y1_r, y1_i + "lxvd2x 46, %6, %2 \n\t" // x2_r, x2_i + "lxvd2x 58, %6, %3 \n\t" // y2_r, y2_i + "lxvd2x 47, %7, %2 \n\t" // x3_r, x3_i + "lxvd2x 59, %7, %3 \n\t" // y3_r, y3_i + + "xxswapd 60,56 \n\t" // y0_i, y0_r + "xxswapd 61,57 \n\t" // y1_i, y1_r + "xxswapd 62,58 \n\t" // y2_i, y2_r + "xxswapd 63,59 \n\t" // y3_i, y3_r + + "addi %2, %2, 64 \n\t" + "addi %3, %3, 64 \n\t" + + "addic. %0 , %0 , -8 \n\t" + "ble 2f \n\t" + + ".align 5 \n\t" + "1: \n\t" + + "dcbt %2, %8 \n\t" + "dcbt %3, %8 \n\t" + + "xvmaddadp 32, 40, 48 \n\t" // x0_r * y0_r , x0_i * y0_i + "lxvd2x 48, 0, %3 \n\t" // y0_r, y0_i + "xvmaddadp 34, 41, 49 \n\t" // x1_r * y1_r , x1_i * y1_i + "lxvd2x 49, %5, %3 \n\t" // y1_r, y1_i + + "xvmaddadp 36, 42, 50 \n\t" // x2_r * y2_r , x2_i * y2_i + "lxvd2x 50, %6, %3 \n\t" // y2_r, y2_i + "xvmaddadp 38, 43, 51 \n\t" // x3_r * y3_r , x3_i * y3_i + "lxvd2x 51, %7, %3 \n\t" // y3_r, y3_i + + "xvmaddadp 33, 40, 52 \n\t" // x0_r * y0_i , x0_i * y0_r + "lxvd2x 40, 0, %2 \n\t" // x0_r, x0_i + "xvmaddadp 35, 41, 53 \n\t" // x1_r * y1_i , x1_i * y1_r + "lxvd2x 41, %5, %2 \n\t" // x1_r, x1_i + + "xvmaddadp 37, 42, 54 \n\t" // x2_r * y2_i , x2_i * y2_r + "lxvd2x 42, %6, %2 \n\t" // x2_r, x2_i + "xvmaddadp 39, 43, 55 \n\t" // x3_r * y3_i , x3_i * y3_r + "lxvd2x 43, %7, %2 \n\t" // x3_r, x3_i + + "xxswapd 52,48 \n\t" // y0_i, y0_r + "xxswapd 53,49 \n\t" // y1_i, y1_r + + "addi %2, %2, 64 \n\t" + "addi %3, %3, 64 \n\t" + + "xxswapd 54,50 \n\t" // y2_i, y2_r + "xxswapd 55,51 \n\t" // y3_i, y3_r + + "xvmaddadp 32, 44, 56 \n\t" // x0_r * y0_r , x0_i * y0_i + "lxvd2x 56, 0, %3 \n\t" // y0_r, y0_i + "xvmaddadp 34, 45, 57 \n\t" // x1_r * y1_r , x1_i * y1_i + "lxvd2x 57, %5, %3 \n\t" // y1_r, y1_i + "xvmaddadp 36, 46, 58 \n\t" // x2_r * y2_r , x2_i * y2_i + "lxvd2x 58, %6, %3 \n\t" // 
y2_r, y2_i + "xvmaddadp 38, 47, 59 \n\t" // x3_r * y3_r , x3_i * y3_i + "lxvd2x 59, %7, %3 \n\t" // y3_r, y3_i + + "xvmaddadp 33, 44, 60 \n\t" // x0_r * y0_i , x0_i * y0_r + "lxvd2x 44, 0, %2 \n\t" // x0_r, x0_i + "xvmaddadp 35, 45, 61 \n\t" // x1_r * y1_i , x1_i * y1_r + "lxvd2x 45, %5, %2 \n\t" // x1_r, x1_i + "xvmaddadp 37, 46, 62 \n\t" // x2_r * y2_i , x2_i * y2_r + "lxvd2x 46, %6, %2 \n\t" // x2_r, x2_i + "xvmaddadp 39, 47, 63 \n\t" // x3_r * y3_i , x3_i * y3_r + "lxvd2x 47, %7, %2 \n\t" // x3_r, x3_i + + "xxswapd 60,56 \n\t" // y0_i, y0_r + "xxswapd 61,57 \n\t" // y1_i, y1_r + + "addi %2, %2, 64 \n\t" + "addi %3, %3, 64 \n\t" + + "xxswapd 62,58 \n\t" // y2_i, y2_r + "xxswapd 63,59 \n\t" // y3_i, y3_r + + "addic. %0 , %0 , -8 \n\t" + "bgt 1b \n\t" + + "2: \n\t" + + "xvmaddadp 32, 40, 48 \n\t" // x0_r * y0_r , x0_i * y0_i + "xvmaddadp 34, 41, 49 \n\t" // x1_r * y1_r , x1_i * y1_i + "xvmaddadp 36, 42, 50 \n\t" // x2_r * y2_r , x2_i * y2_i + "xvmaddadp 38, 43, 51 \n\t" // x3_r * y3_r , x3_i * y3_i + + "xvmaddadp 33, 40, 52 \n\t" // x0_r * y0_i , x0_i * y0_r + "xvmaddadp 35, 41, 53 \n\t" // x1_r * y1_i , x1_i * y1_r + "xvmaddadp 37, 42, 54 \n\t" // x2_r * y2_i , x2_i * y2_r + "xvmaddadp 39, 43, 55 \n\t" // x3_r * y3_i , x3_i * y3_r + + "xvmaddadp 32, 44, 56 \n\t" // x0_r * y0_r , x0_i * y0_i + "xvmaddadp 34, 45, 57 \n\t" // x1_r * y1_r , x1_i * y1_i + "xvmaddadp 36, 46, 58 \n\t" // x2_r * y2_r , x2_i * y2_i + "xvmaddadp 38, 47, 59 \n\t" // x3_r * y3_r , x3_i * y3_i + + "xvmaddadp 33, 44, 60 \n\t" // x0_r * y0_i , x0_i * y0_r + "xvmaddadp 35, 45, 61 \n\t" // x1_r * y1_i , x1_i * y1_r + "xvmaddadp 37, 46, 62 \n\t" // x2_r * y2_i , x2_i * y2_r + "xvmaddadp 39, 47, 63 \n\t" // x3_r * y3_i , x3_i * y3_r + + + "xvadddp 32, 32, 34 \n\t" + "xvadddp 36, 36, 38 \n\t" + + "xvadddp 33, 33, 35 \n\t" + "xvadddp 37, 37, 39 \n\t" + + "xvadddp 32, 32, 36 \n\t" + "xvadddp 33, 33, 37 \n\t" + + "stxvd2x 32, 0, %4 \n\t" + "stxvd2x 33, %5, %4 \n\t" + + : + : + "r" (i), // 0 + "r" (n), 
// 1 + "r" (x1), // 2 + "r" (y1), // 3 + "r" (dot), // 4 + "r" (o16), // 5 + "r" (o32), // 6 + "r" (o48), // 7 + "r" (pre) // 8 + : "cr0", "%0", "%2" , "%3", "memory" + ); + +} + + From 11c44dede1aeb63a1b444ea72f3464a41cfd6ef5 Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Mon, 21 Mar 2016 13:18:23 +0100 Subject: [PATCH 10/48] added optimized sdot kernel for POWER8 --- kernel/power/KERNEL.POWER8 | 2 +- kernel/power/sdot.c | 126 +++++++++++++++++++++ kernel/power/sdot_microk_power8.c | 179 ++++++++++++++++++++++++++++++ 3 files changed, 306 insertions(+), 1 deletion(-) create mode 100644 kernel/power/sdot.c create mode 100644 kernel/power/sdot_microk_power8.c diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index b8a854960..ed426314f 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -120,7 +120,7 @@ ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c #CCOPYKERNEL = ../arm/zcopy.c #ZCOPYKERNEL = ../arm/zcopy.c # -#SDOTKERNEL = ../arm/dot.c +SDOTKERNEL = sdot.c DDOTKERNEL = ddot.c #CDOTKERNEL = ../arm/zdot.c ZDOTKERNEL = zdot.c diff --git a/kernel/power/sdot.c b/kernel/power/sdot.c new file mode 100644 index 000000000..52fb1fe24 --- /dev/null +++ b/kernel/power/sdot.c @@ -0,0 +1,126 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/03/21 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#include "common.h" + +#if defined(POWER8) +#include "sdot_microk_power8.c" +#endif + + +#ifndef HAVE_KERNEL_16 + +static void sdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) +{ + BLASLONG register i = 0; + FLOAT dot = 0.0; + + while(i < n) + { + dot += y[i] * x[i] + + y[i+1] * x[i+1] + + y[i+2] * x[i+2] + + y[i+3] * x[i+3] + + y[i+4] * x[i+4] + + y[i+5] * x[i+5] + + y[i+6] * x[i+6] + + y[i+7] * x[i+7] ; + + i+=8 ; + + } + *d += dot; + +} + +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + + FLOAT dot = 0.0 ; + + if ( n <= 0 ) return(dot); + + if ( (inc_x == 1) && 
(inc_y == 1) ) + { + + BLASLONG n1 = n & -32; + + if ( n1 ) + sdot_kernel_16(n1, x, y , &dot ); + + + i = n1; + while(i < n) + { + + dot += y[i] * x[i] ; + i++ ; + + } + return(dot); + + + } + + BLASLONG n1 = n & -2; + + while(i < n1) + { + + dot += y[iy] * x[ix] + y[iy+inc_y] * x[ix+inc_x]; + ix += inc_x*2 ; + iy += inc_y*2 ; + i+=2 ; + + } + + while(i < n) + { + + dot += y[iy] * x[ix] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return(dot); + +} + + diff --git a/kernel/power/sdot_microk_power8.c b/kernel/power/sdot_microk_power8.c new file mode 100644 index 000000000..6dd588acd --- /dev/null +++ b/kernel/power/sdot_microk_power8.c @@ -0,0 +1,179 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/03/21 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#define HAVE_KERNEL_16 1 +static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline)); + +static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) +{ + + + BLASLONG i = n; + BLASLONG o16 = 16; + BLASLONG o32 = 32; + BLASLONG o48 = 48; + BLASLONG o64 = 64; + BLASLONG o80 = 80; + BLASLONG o96 = 96; + BLASLONG o112 = 112; + FLOAT *x1=x; + FLOAT *y1=y; + BLASLONG pre = 384; + FLOAT tempdot[4]; + + + __asm__ __volatile__ + ( + "xxlxor 32,32,32 \n\t" + "xxlxor 33,33,33 \n\t" + "xxlxor 34,34,34 \n\t" + "xxlxor 35,35,35 \n\t" + "xxlxor 36,36,36 \n\t" + "xxlxor 37,37,37 \n\t" + "xxlxor 38,38,38 \n\t" + "xxlxor 39,39,39 \n\t" + + "dcbt %2, %12 \n\t" + "dcbt %3, %12 \n\t" + + "lxvw4x 40, 0, %2 \n\t" + "lxvw4x 48, 0, %3 \n\t" + "lxvw4x 41, %5, %2 \n\t" + "lxvw4x 49, %5, %3 \n\t" + "lxvw4x 42, %6, %2 \n\t" + "lxvw4x 50, %6, %3 \n\t" + "lxvw4x 43, %7, %2 \n\t" + "lxvw4x 51, %7, %3 \n\t" + "lxvw4x 44, %8, %2 \n\t" + "lxvw4x 52, %8, %3 \n\t" + "lxvw4x 45, %9, %2 \n\t" + "lxvw4x 53, %9, %3 \n\t" + "lxvw4x 46, %10, %2 \n\t" + "lxvw4x 54, %10, %3 
\n\t" + "lxvw4x 47, %11, %2 \n\t" + "lxvw4x 55, %11, %3 \n\t" + + "addi %2, %2, 128 \n\t" + "addi %3, %3, 128 \n\t" + + "addic. %0 , %0 , -32 \n\t" + "ble 2f \n\t" + + ".align 5 \n\t" + "1: \n\t" + + "dcbt %2, %12 \n\t" + "dcbt %3, %12 \n\t" + + "xvmaddasp 32, 40, 48 \n\t" + "lxvw4x 40, 0, %2 \n\t" + "lxvw4x 48, 0, %3 \n\t" + "xvmaddasp 33, 41, 49 \n\t" + "lxvw4x 41, %5, %2 \n\t" + "lxvw4x 49, %5, %3 \n\t" + "xvmaddasp 34, 42, 50 \n\t" + "lxvw4x 42, %6, %2 \n\t" + "lxvw4x 50, %6, %3 \n\t" + "xvmaddasp 35, 43, 51 \n\t" + "lxvw4x 43, %7, %2 \n\t" + "lxvw4x 51, %7, %3 \n\t" + "xvmaddasp 36, 44, 52 \n\t" + "lxvw4x 44, %8, %2 \n\t" + "lxvw4x 52, %8, %3 \n\t" + "xvmaddasp 37, 45, 53 \n\t" + "lxvw4x 45, %9, %2 \n\t" + "lxvw4x 53, %9, %3 \n\t" + "xvmaddasp 38, 46, 54 \n\t" + "lxvw4x 46, %10, %2 \n\t" + "lxvw4x 54, %10, %3 \n\t" + "xvmaddasp 39, 47, 55 \n\t" + + "lxvw4x 47, %11, %2 \n\t" + "lxvw4x 55, %11, %3 \n\t" + + + "addi %2, %2, 128 \n\t" + "addi %3, %3, 128 \n\t" + + "addic. %0 , %0 , -32 \n\t" + "bgt 1b \n\t" + + "2: \n\t" + + "xvmaddasp 32, 40, 48 \n\t" + "xvmaddasp 33, 41, 49 \n\t" + "xvmaddasp 34, 42, 50 \n\t" + "xvmaddasp 35, 43, 51 \n\t" + "xvmaddasp 36, 44, 52 \n\t" + "xvmaddasp 37, 45, 53 \n\t" + "xvmaddasp 38, 46, 54 \n\t" + "xvmaddasp 39, 47, 55 \n\t" + + "xvaddsp 32, 32 , 33 \n\t" + "xvaddsp 34, 34 , 35 \n\t" + "xvaddsp 36, 36 , 37 \n\t" + "xvaddsp 38, 38 , 39 \n\t" + + "xvaddsp 32, 32 , 34 \n\t" + "xvaddsp 36, 36 , 38 \n\t" + + "xvaddsp 32, 32 , 36 \n\t" + + "stxvw4x 32, 0 , %4 \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x1), // 2 + "r" (y1), // 3 + "r" (tempdot), // 4 + "r" (o16), // 5 + "r" (o32), // 6 + "r" (o48), // 7 + "r" (o64), // 8 + "r" (o80), // 9 + "r" (o96), // 10 + "r" (o112), // 11 + "r" (pre) // 12 + : "cr0", "%0", "%2" , "%3", "memory" + ); + + *dot = tempdot[0] + tempdot[1] + tempdot[2] + tempdot[3]; + + +} + + From 61cf8f74d9e23757844dc1e0d2a04e641a23fb7f Mon Sep 17 00:00:00 2001 From: theoractice Date: Tue, 22 Mar 2016 
19:14:54 +0800 Subject: [PATCH 11/48] Fix access violation on Windows while static linking --- CONTRIBUTORS.md | 5 +++-- driver/others/memory.c | 20 ++++++++++++++++++++ 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index da56c0758..ebe52ea8a 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -147,5 +147,6 @@ In chronological order: * [2016-03-14] Additional functional Assembly Kernels for Cortex-A57 * [2016-03-14] Optimize Dgemm 4x4 for Cortex-A57 -* [Your name or handle] <[email or website]> - * [Date] [Brief summary of your changes] +* theoractice + * [2016-03-20] Fix compiler error in VisualStudio with CMake + * [2016-03-22] Fix access violation on Windows while static linking diff --git a/driver/others/memory.c b/driver/others/memory.c index e64781740..c4f261850 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -1452,6 +1452,26 @@ BOOL APIENTRY DllMain(HMODULE hModule, DWORD ul_reason_for_call, LPVOID lpReser } return TRUE; } + +/* + This is to allow static linking. 
+ Code adapted from Google performance tools: + https://gperftools.googlecode.com/git-history/perftools-1.0/src/windows/port.cc + Reference: + https://sourceware.org/ml/pthreads-win32/2008/msg00028.html +*/ +static int on_process_term(void) +{ + gotoblas_quit(); + return 0; +} +#pragma comment(linker, "/INCLUDE:__tls_used") +#pragma data_seg(push, old_seg) +#pragma data_seg(".CRT$XLB") +static void (APIENTRY *dll_callback)(HINSTANCE h, DWORD ul_reason_for_call, PVOID pv) = DllMain; +#pragma data_seg(".CRT$XTU") +static int(*p_process_term)(void) = on_process_term; +#pragma data_seg(pop, old_seg) #endif #if (defined(C_PGI) || (!defined(C_SUN) && defined(F_INTERFACE_SUN))) && (defined(ARCH_X86) || defined(ARCH_X86_64)) From aa744dfa59223419adc35a0e5d1fe11d3a11a1aa Mon Sep 17 00:00:00 2001 From: Theoractice Date: Tue, 22 Mar 2016 20:02:37 +0800 Subject: [PATCH 12/48] Update memory.c --- driver/others/memory.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/driver/others/memory.c b/driver/others/memory.c index c4f261850..e89f5c328 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -1459,13 +1459,18 @@ BOOL APIENTRY DllMain(HMODULE hModule, DWORD ul_reason_for_call, LPVOID lpReser https://gperftools.googlecode.com/git-history/perftools-1.0/src/windows/port.cc Reference: https://sourceware.org/ml/pthreads-win32/2008/msg00028.html + http://ci.boost.org/svn-trac/browser/trunk/libs/thread/src/win32/tss_pe.cpp */ static int on_process_term(void) { gotoblas_quit(); return 0; } +#ifdef _WIN64 +#pragma comment(linker, "/INCLUDE:_tls_used") +#else #pragma comment(linker, "/INCLUDE:__tls_used") +#endif #pragma data_seg(push, old_seg) #pragma data_seg(".CRT$XLB") static void (APIENTRY *dll_callback)(HINSTANCE h, DWORD ul_reason_for_call, PVOID pv) = DllMain; From 0664ba4c97fe097595b8749e78b326bf0dbbb4f2 Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Tue, 22 Mar 2016 14:50:03 +0100 Subject: [PATCH 13/48] added optimized daxpy kernel for POWER8 --- 
kernel/power/KERNEL.POWER8 | 2 +- kernel/power/daxpy.c | 136 +++++++++++++++++++ kernel/power/daxpy_microk_power8.c | 201 +++++++++++++++++++++++++++++ 3 files changed, 338 insertions(+), 1 deletion(-) create mode 100644 kernel/power/daxpy.c create mode 100644 kernel/power/daxpy_microk_power8.c diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index ed426314f..d3c2e694c 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -111,7 +111,7 @@ ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c #ZASUMKERNEL = ../arm/zasum.c # #SAXPYKERNEL = ../arm/axpy.c -#DAXPYKERNEL = ../arm/axpy.c +DAXPYKERNEL = daxpy.c #CAXPYKERNEL = ../arm/zaxpy.c #ZAXPYKERNEL = ../arm/zaxpy.c # diff --git a/kernel/power/daxpy.c b/kernel/power/daxpy.c new file mode 100644 index 000000000..4365bd88d --- /dev/null +++ b/kernel/power/daxpy.c @@ -0,0 +1,136 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/03/22 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#include "common.h" + + +#if defined(POWER8) +#include "daxpy_microk_power8.c" +#endif + + +#ifndef HAVE_KERNEL_8 + +static void daxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + BLASLONG register i = 0; + FLOAT a = *alpha; + + while(i < n) + { + y[i] += a * x[i]; + y[i+1] += a * x[i+1]; + y[i+2] += a * x[i+2]; + y[i+3] += a * x[i+3]; + y[i+4] += a * x[i+4]; + y[i+5] += a * x[i+5]; + y[i+6] += a * x[i+6]; + y[i+7] += a * x[i+7]; + i+=8 ; + + } + +} + +#endif + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT a2[4]; + a2[0]=da; + a2[1]=da; + a2[2]=da; + a2[3]=da; + + if ( n <= 0 ) return(0); + + if ( (inc_x == 1) && (inc_y == 1) ) + { + + BLASLONG n1 = n & -16; + + if ( n1 ) + daxpy_kernel_8(n1, x, y , a2 ); + + i = n1; + while(i < n) + { + + y[i] += da * x[i] ; + i++ ; + + } + return(0); + + + } + + BLASLONG n1 = n & -4; + + while(i < n1) + { + + FLOAT m1 = da * x[ix] ; + FLOAT m2 = da * x[ix+inc_x] ; + FLOAT m3 = da * 
x[ix+2*inc_x] ; + FLOAT m4 = da * x[ix+3*inc_x] ; + + y[iy] += m1 ; + y[iy+inc_y] += m2 ; + y[iy+2*inc_y] += m3 ; + y[iy+3*inc_y] += m4 ; + + ix += inc_x*4 ; + iy += inc_y*4 ; + i+=4 ; + + } + + while(i < n) + { + + y[iy] += da * x[ix] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return(0); + +} + + diff --git a/kernel/power/daxpy_microk_power8.c b/kernel/power/daxpy_microk_power8.c new file mode 100644 index 000000000..bb3f73aca --- /dev/null +++ b/kernel/power/daxpy_microk_power8.c @@ -0,0 +1,201 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/03/22 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + + +#define HAVE_KERNEL_8 1 +static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline)); + +static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + + + BLASLONG i = n; + BLASLONG o16 = 16; + BLASLONG o32 = 32; + BLASLONG o48 = 48; + FLOAT *x1=x; + FLOAT *y1=y; + FLOAT *y2=y+1; + BLASLONG pre = 384; + + __asm__ __volatile__ + ( + + "lxsdx 33, %5, %4 \n\t" + "xxspltd 32, 33, 0 \n\t" + "addi %8, %8, -8 \n\t" + + "dcbt %2, %9 \n\t" + "dcbt %3, %9 \n\t" + + "lxvd2x 40, 0, %2 \n\t" + "lxvd2x 41, %5, %2 \n\t" + "lxvd2x 42, %6, %2 \n\t" + "lxvd2x 43, %7, %2 \n\t" + + "lxvd2x 48, 0, %3 \n\t" + "lxvd2x 49, %5, %3 \n\t" + "lxvd2x 50, %6, %3 \n\t" + "lxvd2x 51, %7, %3 \n\t" + + "addi %2, %2, 64 \n\t" + "addi %3, %3, 64 \n\t" + + "lxvd2x 44, 0, %2 \n\t" + "lxvd2x 45, %5, %2 \n\t" + "lxvd2x 46, %6, %2 \n\t" + "lxvd2x 47, %7, %2 \n\t" + + "lxvd2x 52, 0, %3 \n\t" + "lxvd2x 53, %5, %3 \n\t" + "lxvd2x 54, %6, %3 \n\t" + "lxvd2x 55, %7, %3 \n\t" + + "addi %2, %2, 64 \n\t" + "addi %3, %3, 64 \n\t" + + "addic. 
%0 , %0 , -16 \n\t" + "ble 2f \n\t" + + ".align 5 \n\t" + "1: \n\t" + + "dcbt %2, %9 \n\t" + "dcbt %3, %9 \n\t" + + "xvmaddadp 48, 40, 32 \n\t" + "xvmaddadp 49, 41, 32 \n\t" + + "lxvd2x 40, 0, %2 \n\t" + "lxvd2x 41, %5, %2 \n\t" + + "stxvd2x 48, 0, %8 \n\t" + "stxvd2x 49, %5, %8 \n\t" + + "xvmaddadp 50, 42, 32 \n\t" + "xvmaddadp 51, 43, 32 \n\t" + + "lxvd2x 42, %6, %2 \n\t" + "lxvd2x 43, %7, %2 \n\t" + + "stxvd2x 50, %6, %8 \n\t" + "stxvd2x 51, %7, %8 \n\t" + + "lxvd2x 48, 0, %3 \n\t" + "lxvd2x 49, %5, %3 \n\t" + "lxvd2x 50, %6, %3 \n\t" + "lxvd2x 51, %7, %3 \n\t" + + "addi %2, %2, 64 \n\t" + "addi %8, %8, 64 \n\t" + + "xvmaddadp 52, 44, 32 \n\t" + "addi %3, %3, 64 \n\t" + "xvmaddadp 53, 45, 32 \n\t" + + "lxvd2x 44, 0, %2 \n\t" + "lxvd2x 45, %5, %2 \n\t" + + "stxvd2x 52, 0, %8 \n\t" + "stxvd2x 53, %5, %8 \n\t" + + "xvmaddadp 54, 46, 32 \n\t" + "xvmaddadp 55, 47, 32 \n\t" + + "lxvd2x 46, %6, %2 \n\t" + "lxvd2x 47, %7, %2 \n\t" + + "stxvd2x 54, %6, %8 \n\t" + "stxvd2x 55, %7, %8 \n\t" + + "addi %2, %2, 64 \n\t" + "addi %8, %8, 64 \n\t" + + "lxvd2x 52, 0, %3 \n\t" + "lxvd2x 53, %5, %3 \n\t" + "lxvd2x 54, %6, %3 \n\t" + "lxvd2x 55, %7, %3 \n\t" + + "addi %3, %3, 64 \n\t" + + + "addic. 
%0 , %0 , -16 \n\t" + "bgt 1b \n\t" + + "2: \n\t" + + + "xvmaddadp 48, 40, 32 \n\t" + "xvmaddadp 49, 41, 32 \n\t" + "xvmaddadp 50, 42, 32 \n\t" + "xvmaddadp 51, 43, 32 \n\t" + + "xvmaddadp 52, 44, 32 \n\t" + "xvmaddadp 53, 45, 32 \n\t" + "xvmaddadp 54, 46, 32 \n\t" + "xvmaddadp 55, 47, 32 \n\t" + + "stxvd2x 48, 0, %8 \n\t" + "stxvd2x 49, %5, %8 \n\t" + "stxvd2x 50, %6, %8 \n\t" + "stxvd2x 51, %7, %8 \n\t" + + "addi %8, %8, 64 \n\t" + + "stxvd2x 52, 0, %8 \n\t" + "stxvd2x 53, %5, %8 \n\t" + "stxvd2x 54, %6, %8 \n\t" + "stxvd2x 55, %7, %8 \n\t" + + "addi %8, %8, 64 \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x1), // 2 + "r" (y1), // 3 + "r" (alpha), // 4 + "r" (o16), // 5 + "r" (o32), // 6 + "r" (o48), // 7 + "r" (y2), // 8 + "r" (pre) // 9 + : "cr0", "%0", "%2" , "%3", "%8", "memory" + ); + +} + + From 53bfc83c266d44e71650a8238b693280e32c153b Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Tue, 22 Mar 2016 11:37:35 -0400 Subject: [PATCH 14/48] Update appveyor version. --- appveyor.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/appveyor.yml b/appveyor.yml index 172a49b42..d9359e99a 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -1,4 +1,4 @@ -version: 0.2.15.{build} +version: 0.2.17.{build} #environment: From 55eda3813b48fdc82bc7e6ade28f51b3a3236d82 Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Wed, 23 Mar 2016 11:20:23 +0100 Subject: [PATCH 15/48] added optimized zaxpy kernel for POWER8 --- kernel/power/KERNEL.POWER8 | 2 +- kernel/power/zaxpy.c | 140 ++++++++++++++++ kernel/power/zaxpy_microk_power8.c | 250 +++++++++++++++++++++++++++++ 3 files changed, 391 insertions(+), 1 deletion(-) create mode 100644 kernel/power/zaxpy.c create mode 100644 kernel/power/zaxpy_microk_power8.c diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index d3c2e694c..1f9cef0e5 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -113,7 +113,7 @@ ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c #SAXPYKERNEL = 
../arm/axpy.c DAXPYKERNEL = daxpy.c #CAXPYKERNEL = ../arm/zaxpy.c -#ZAXPYKERNEL = ../arm/zaxpy.c +ZAXPYKERNEL = zaxpy.c # #SCOPYKERNEL = ../arm/copy.c #DCOPYKERNEL = ../arm/copy.c diff --git a/kernel/power/zaxpy.c b/kernel/power/zaxpy.c new file mode 100644 index 000000000..0ee0c1bf9 --- /dev/null +++ b/kernel/power/zaxpy.c @@ -0,0 +1,140 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2016/03/23 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#include "common.h" + + +#if defined(POWER8) +#include "zaxpy_microk_power8.c" +#endif + + +#ifndef HAVE_KERNEL_4 + +static void zaxpy_kernel_4(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + BLASLONG register i = 0; + BLASLONG register ix = 0; + FLOAT da_r = alpha[0]; + FLOAT da_i = alpha[1]; + + + while(i < n) + { +#if !defined(CONJ) + y[ix] += ( da_r * x[ix] - da_i * x[ix+1] ) ; + y[ix+1] += ( da_r * x[ix+1] + da_i * x[ix] ) ; + y[ix+2] += ( da_r * x[ix+2] - da_i * x[ix+3] ) ; + y[ix+3] += ( da_r * x[ix+3] + da_i * x[ix+2] ) ; +#else + y[ix] += ( da_r * x[ix] + da_i * x[ix+1] ) ; + y[ix+1] -= ( da_r * x[ix+1] - da_i * x[ix] ) ; + y[ix+2] += ( da_r * x[ix+2] + da_i * x[ix+3] ) ; + y[ix+3] -= ( da_r * x[ix+3] - da_i * x[ix+2] ) ; +#endif + + ix+=4 ; + i+=2 ; + + } + +} + +#endif + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT da[4]; + + if ( n <= 0 ) return(0); + + if ( (inc_x == 1) && (inc_y == 1) ) + { + + BLASLONG n1 = n & -16; + + if ( n1 ) + { + da[0] = da_r; + da[1] = da_r; + da[2] = da_i; + da[3] = da_i; + zaxpy_kernel_4(n1, x, y , da ); + ix = 2 * n1; + } + i = n1; + while(i < n) + { +#if !defined(CONJ) + y[ix] += ( da_r * x[ix] - da_i * x[ix+1] ) ; + y[ix+1] += ( da_r * x[ix+1] + da_i * x[ix] ) ; +#else + y[ix] += ( da_r * x[ix] + da_i * x[ix+1] ) ; + y[ix+1] -= ( da_r * x[ix+1] - da_i * x[ix] ) ; +#endif + i++ ; + ix += 2; + + } + return(0); + + + } + + inc_x *=2; + inc_y *=2; + + while(i < n) + { + +#if 
!defined(CONJ) + y[iy] += ( da_r * x[ix] - da_i * x[ix+1] ) ; + y[iy+1] += ( da_r * x[ix+1] + da_i * x[ix] ) ; +#else + y[iy] += ( da_r * x[ix] + da_i * x[ix+1] ) ; + y[iy+1] -= ( da_r * x[ix+1] - da_i * x[ix] ) ; +#endif + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return(0); + +} + + diff --git a/kernel/power/zaxpy_microk_power8.c b/kernel/power/zaxpy_microk_power8.c new file mode 100644 index 000000000..c8a529fd9 --- /dev/null +++ b/kernel/power/zaxpy_microk_power8.c @@ -0,0 +1,250 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/03/23 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + + +#define HAVE_KERNEL_4 1 +static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline)); + +static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + + + BLASLONG i = n; + BLASLONG o16 = 16; + BLASLONG o32 = 32; + BLASLONG o48 = 48; + FLOAT *x1=x; + FLOAT *y1=y; + FLOAT *y2=y+1; + BLASLONG pre = 384; + +#if !defined(CONJ) + FLOAT mvec[2] = { -1.0, 1.0 }; +#else + FLOAT mvec[2] = { 1.0, -1.0 }; +#endif + + + __asm__ __volatile__ + ( + + "lxsdx 34, 0 , %4 \n\t" // alpha_r + "lxsdx 35, %5, %4 \n\t" // alpha_i + "xxspltd 32, 34, 0 \n\t" + "xxspltd 33, 35, 0 \n\t" + + "lxvd2x 36, 0, %9 \n\t" // mvec + +#if !defined(CONJ) + "xvmuldp 33, 33 , 36 \n\t" // alpha_i * mvec +#else + "xvmuldp 32, 32 , 36 \n\t" // alpha_r * mvec +#endif + + "addi %8, %8, -8 \n\t" + + "dcbt %2, %10 \n\t" + "dcbt %3, %10 \n\t" + + + "lxvd2x 40, 0, %2 \n\t" // x0 + "lxvd2x 41, %5, %2 \n\t" // x1 + "lxvd2x 42, %6, %2 \n\t" // x2 + "lxvd2x 43, %7, %2 \n\t" // x3 + + "lxvd2x 48, 0, %3 \n\t" // y0 + "lxvd2x 49, %5, %3 \n\t" // y1 + "lxvd2x 50, %6, %3 \n\t" // 
y2 + "lxvd2x 51, %7, %3 \n\t" // y3 + + "xxswapd 56, 40 \n\t" // exchange real and imag part + "xxswapd 57, 41 \n\t" // exchange real and imag part + "xxswapd 58, 42 \n\t" // exchange real and imag part + "xxswapd 59, 43 \n\t" // exchange real and imag part + + "addi %2, %2, 64 \n\t" + "addi %3, %3, 64 \n\t" + + "lxvd2x 44, 0, %2 \n\t" // x4 + "lxvd2x 45, %5, %2 \n\t" // x5 + "lxvd2x 46, %6, %2 \n\t" // x6 + "lxvd2x 47, %7, %2 \n\t" // x7 + + "lxvd2x 52, 0, %3 \n\t" // y4 + "lxvd2x 53, %5, %3 \n\t" // y5 + "lxvd2x 54, %6, %3 \n\t" // y6 + "lxvd2x 55, %7, %3 \n\t" // y7 + + "xxswapd 60, 44 \n\t" // exchange real and imag part + "xxswapd 61, 45 \n\t" // exchange real and imag part + "xxswapd 62, 46 \n\t" // exchange real and imag part + "xxswapd 63, 47 \n\t" // exchange real and imag part + + "addi %2, %2, 64 \n\t" + "addi %3, %3, 64 \n\t" + + "addic. %0 , %0 , -8 \n\t" + "ble 2f \n\t" + + ".align 5 \n\t" + "1: \n\t" + + "dcbt %2, %10 \n\t" + "dcbt %3, %10 \n\t" + + "xvmaddadp 48, 40, 32 \n\t" // alpha_r * x0_r , alpha_r * x0_i + "xvmaddadp 49, 41, 32 \n\t" + "lxvd2x 40, 0, %2 \n\t" // x0 + "lxvd2x 41, %5, %2 \n\t" // x1 + "xvmaddadp 50, 42, 32 \n\t" + "xvmaddadp 51, 43, 32 \n\t" + "lxvd2x 42, %6, %2 \n\t" // x2 + "lxvd2x 43, %7, %2 \n\t" // x3 + + "xvmaddadp 52, 44, 32 \n\t" + "addi %2, %2, 64 \n\t" + "xvmaddadp 53, 45, 32 \n\t" + "lxvd2x 44, 0, %2 \n\t" // x4 + "lxvd2x 45, %5, %2 \n\t" // x5 + "xvmaddadp 54, 46, 32 \n\t" + "xvmaddadp 55, 47, 32 \n\t" + "lxvd2x 46, %6, %2 \n\t" // x6 + "lxvd2x 47, %7, %2 \n\t" // x7 + + "xvmaddadp 48, 56, 33 \n\t" // alpha_i * x0_i , alpha_i * x0_r + "addi %2, %2, 64 \n\t" + "xvmaddadp 49, 57, 33 \n\t" + "xvmaddadp 50, 58, 33 \n\t" + "xvmaddadp 51, 59, 33 \n\t" + + "xvmaddadp 52, 60, 33 \n\t" + "xvmaddadp 53, 61, 33 \n\t" + "xvmaddadp 54, 62, 33 \n\t" + "xvmaddadp 55, 63, 33 \n\t" + + "stxvd2x 48, 0, %8 \n\t" + "stxvd2x 49, %5, %8 \n\t" + "stxvd2x 50, %6, %8 \n\t" + "stxvd2x 51, %7, %8 \n\t" + + "addi %8, %8, 64 \n\t" + + "stxvd2x 
52, 0, %8 \n\t" + "stxvd2x 53, %5, %8 \n\t" + "stxvd2x 54, %6, %8 \n\t" + "stxvd2x 55, %7, %8 \n\t" + + "addi %8, %8, 64 \n\t" + + "xxswapd 56, 40 \n\t" // exchange real and imag part + "xxswapd 57, 41 \n\t" // exchange real and imag part + "lxvd2x 48, 0, %3 \n\t" // y0 + "lxvd2x 49, %5, %3 \n\t" // y1 + "xxswapd 58, 42 \n\t" // exchange real and imag part + "xxswapd 59, 43 \n\t" // exchange real and imag part + "lxvd2x 50, %6, %3 \n\t" // y2 + "lxvd2x 51, %7, %3 \n\t" // y3 + + "xxswapd 60, 44 \n\t" // exchange real and imag part + "addi %3, %3, 64 \n\t" + "xxswapd 61, 45 \n\t" // exchange real and imag part + "lxvd2x 52, 0, %3 \n\t" // y4 + "lxvd2x 53, %5, %3 \n\t" // y5 + "xxswapd 62, 46 \n\t" // exchange real and imag part + "xxswapd 63, 47 \n\t" // exchange real and imag part + "lxvd2x 54, %6, %3 \n\t" // y6 + "lxvd2x 55, %7, %3 \n\t" // y7 + + "addi %3, %3, 64 \n\t" + + "addic. %0 , %0 , -8 \n\t" + "bgt 1b \n\t" + + "2: \n\t" + + "xvmaddadp 48, 40, 32 \n\t" // alpha_r * x0_r , alpha_r * x0_i + "xvmaddadp 49, 41, 32 \n\t" + "xvmaddadp 50, 42, 32 \n\t" + "xvmaddadp 51, 43, 32 \n\t" + + "xvmaddadp 52, 44, 32 \n\t" + "xvmaddadp 53, 45, 32 \n\t" + "xvmaddadp 54, 46, 32 \n\t" + "xvmaddadp 55, 47, 32 \n\t" + + "xvmaddadp 48, 56, 33 \n\t" // alpha_i * x0_i , alpha_i * x0_r + "xvmaddadp 49, 57, 33 \n\t" + "xvmaddadp 50, 58, 33 \n\t" + "xvmaddadp 51, 59, 33 \n\t" + + "xvmaddadp 52, 60, 33 \n\t" + "xvmaddadp 53, 61, 33 \n\t" + "xvmaddadp 54, 62, 33 \n\t" + "xvmaddadp 55, 63, 33 \n\t" + + + "stxvd2x 48, 0, %8 \n\t" + "stxvd2x 49, %5, %8 \n\t" + "stxvd2x 50, %6, %8 \n\t" + "stxvd2x 51, %7, %8 \n\t" + + "addi %8, %8, 64 \n\t" + + "stxvd2x 52, 0, %8 \n\t" + "stxvd2x 53, %5, %8 \n\t" + "stxvd2x 54, %6, %8 \n\t" + "stxvd2x 55, %7, %8 \n\t" + + "addi %8, %8, 64 \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x1), // 2 + "r" (y1), // 3 + "r" (alpha), // 4 + "r" (o16), // 5 + "r" (o32), // 6 + "r" (o48), // 7 + "r" (y2), // 8 + "r" (mvec), // 9 + "r" (pre) // 10 + : 
"cr0", "%0", "%2" , "%3", "%8", "memory" + ); + +} + + From 3b5ffb49d32100057771c309e373c733ae0d5f5e Mon Sep 17 00:00:00 2001 From: Ashwin Sekhar T K Date: Thu, 17 Mar 2016 10:23:51 +0530 Subject: [PATCH 16/48] Cortex-A57: Improve DGEMM 8x4 Implementation --- kernel/arm64/dgemm_kernel_8x4.S | 356 +++++++++++++++++--------------- 1 file changed, 194 insertions(+), 162 deletions(-) diff --git a/kernel/arm64/dgemm_kernel_8x4.S b/kernel/arm64/dgemm_kernel_8x4.S index a607fecc4..33e076e6a 100755 --- a/kernel/arm64/dgemm_kernel_8x4.S +++ b/kernel/arm64/dgemm_kernel_8x4.S @@ -52,12 +52,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define alpha0 d10 #define alphaV0 v10.d[0] -#define alpha1 d11 -#define alphaV1 v11.d[0] -#define alpha2 d14 -#define alphaV2 v14.d[0] -#define alpha3 d15 -#define alphaV3 v15.d[0] + +#define A_PRE_SIZE 2560 +#define B_PRE_SIZE 448 +#define C_PRE_SIZE 128 // 00 origM // 01 origN @@ -74,8 +72,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // 12 pCRow0 // 13 pCRow1 // 14 pCRow2 -// 15 pA -// 16 +// 15 pCRow3 +// 16 pA // 17 // 18 must save // 19 must save @@ -100,14 +98,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. //v05 pA1_2, pA1_3 //v06 pA1_4, pA1_5 //v07 pA1_6, pA1_7 -//v08 must save pB0_0, pB0_1 -//v09 must save pB0_2, pB0_3 -//v10 must save ALPHA0 -//v11 must save ALPHA1 -//v12 must save pB1_0, pB1_1 -//v13 must save pB1_2, pB1_3 -//v14 must save ALPHA2 -//v15 must save ALPHA3 +//v08 must save pB0_0 +//v09 must save pB0_1 +//v10 must save pB0_2 --> ALPHA0 +//v11 must save pB0_3 +//v12 must save pB1_0 +//v13 must save pB1_1 +//v14 must save pB1_2 +//v15 must save pB1_3 //v16 must save C00, C01 //v17 must save C02, C03 //v18 C04, C05 @@ -149,244 +147,257 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL8x4_I - ld1 {v0.2d, v1.2d}, [pA] - add pA, pA, #32 - ld1 {v2.2d, v3.2d}, [pA] - add pA, pA, #32 - ldp d8, d9, [pB] - add pB, pB, #16 - ldp d10, d11, [pB] - add pB, pB, #16 + ldp q0, q1, [pA], #32 + + ldp d8, d9, [pB], #16 fmul v16.2d, v0.2d, v8.2d[0] - fmul v17.2d, v1.2d, v8.2d[0] + fmul v20.2d, v0.2d, v9.2d[0] - fmul v18.2d, v2.2d, v8.2d[0] - fmul v19.2d, v3.2d, v8.2d[0] + ldp d10, d11, [pB], #16 - fmul v20.2d, v0.2d, v9.2d[0] + fmul v17.2d, v1.2d, v8.2d[0] fmul v21.2d, v1.2d, v9.2d[0] - fmul v22.2d, v2.2d, v9.2d[0] - fmul v23.2d, v3.2d, v9.2d[0] + ldp q2, q3, [pA], #32 fmul v24.2d, v0.2d, v10.2d[0] + fmul v28.2d, v0.2d, v11.2d[0] + + ldp q4, q5, [pA], #32 + fmul v25.2d, v1.2d, v10.2d[0] + fmul v29.2d, v1.2d, v11.2d[0] + + ldp d12, d13, [pB], #16 + + fmul v18.2d, v2.2d, v8.2d[0] + fmul v22.2d, v2.2d, v9.2d[0] + + ldp d14, d15, [pB], #16 fmul v26.2d, v2.2d, v10.2d[0] + fmul v30.2d, v2.2d, v11.2d[0] + + ldp q6, q7, [pA], #32 + + fmul v19.2d, v3.2d, v8.2d[0] fmul v27.2d, v3.2d, v10.2d[0] - fmul v28.2d, v0.2d, v11.2d[0] - fmul v29.2d, v1.2d, v11.2d[0] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] - fmul v30.2d, v2.2d, v11.2d[0] fmul v31.2d, v3.2d, v11.2d[0] + fmul v23.2d, v3.2d, v9.2d[0] - ld1 {v4.2d, v5.2d}, [pA] - add pA, pA, #32 - ld1 {v6.2d, v7.2d}, [pA] - add pA, pA, #32 - ldp d12, d13, [pB] - add pB, pB, #16 - ldp d14, d15, [pB] - add pB, pB, #16 + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] .endm .macro KERNEL8x4_M1 fmla v16.2d, v0.2d, v8.2d[0] - fmla v21.2d, v1.2d, v9.2d[0] - fmla v26.2d, v2.2d, v10.2d[0] - fmla v31.2d, v3.2d, v11.2d[0] - - ld1 {v4.2d}, [pA], #16 - fmla v20.2d, v0.2d, v9.2d[0] - fmla v17.2d, v1.2d, v8.2d[0] - - ld1 {v5.2d}, [pA], #16 - - fmla v30.2d, v2.2d, v11.2d[0] - fmla v27.2d, v3.2d, v10.2d[0] - ldp d12, d13, [pB] - add pB, pB, #16 + ldp q4, q5, [pA], #32 + fmla v24.2d, v0.2d, v10.2d[0] fmla v28.2d, v0.2d, v11.2d[0] - fmla v25.2d, v1.2d, v10.2d[0] - ldp d14, d15, [pB] - add pB, pB, #16 + ldp d12, d13, [pB], #16 - fmla v18.2d, v2.2d, 
v8.2d[0] - fmla v23.2d, v3.2d, v9.2d[0] + fmla v17.2d, v1.2d, v8.2d[0] + fmla v25.2d, v1.2d, v10.2d[0] - ld1 {v6.2d}, [pA], #16 + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] - fmla v24.2d, v0.2d, v10.2d[0] + fmla v21.2d, v1.2d, v9.2d[0] fmla v29.2d, v1.2d, v11.2d[0] - ld1 {v7.2d}, [pA], #16 + ldp d14, d15, [pB], #16 + fmla v18.2d, v2.2d, v8.2d[0] fmla v22.2d, v2.2d, v9.2d[0] + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + + fmla v26.2d, v2.2d, v10.2d[0] + fmla v30.2d, v2.2d, v11.2d[0] fmla v19.2d, v3.2d, v8.2d[0] + fmla v23.2d, v3.2d, v9.2d[0] + + ldp q6, q7, [pA], #32 - prfm PLDL1KEEP, [pA, #224] - prfm PLDL1KEEP, [pA, #224+64] + fmla v27.2d, v3.2d, v10.2d[0] + fmla v31.2d, v3.2d, v11.2d[0] .endm .macro KERNEL8x4_M2 fmla v16.2d, v4.2d, v12.2d[0] - fmla v21.2d, v5.2d, v13.2d[0] - fmla v26.2d, v6.2d, v14.2d[0] - fmla v31.2d, v7.2d, v15.2d[0] - - ld1 {v0.2d}, [pA], #16 - fmla v20.2d, v4.2d, v13.2d[0] - fmla v17.2d, v5.2d, v12.2d[0] + fmla v24.2d, v4.2d, v14.2d[0] + fmla v28.2d, v4.2d, v15.2d[0] - ld1 {v1.2d}, [pA], #16 + ldp q0, q1, [pA], #32 - fmla v30.2d, v6.2d, v15.2d[0] - fmla v27.2d, v7.2d, v14.2d[0] + fmla v17.2d, v5.2d, v12.2d[0] + fmla v25.2d, v5.2d, v14.2d[0] - ldp d8, d9, [pB] - add pB, pB, #16 + ldp d8, d9, [pB], #16 - fmla v28.2d, v4.2d, v15.2d[0] - fmla v25.2d, v5.2d, v14.2d[0] + fmla v21.2d, v5.2d, v13.2d[0] + fmla v29.2d, v5.2d, v15.2d[0] - ldp d10, d11, [pB] - add pB, pB, #16 + ldp d10, d11, [pB], #16 + fmla v18.2d, v6.2d, v12.2d[0] fmla v22.2d, v6.2d, v13.2d[0] - fmla v19.2d, v7.2d, v12.2d[0] - ld1 {v2.2d}, [pA], #16 - - fmla v24.2d, v4.2d, v14.2d[0] - fmla v29.2d, v5.2d, v15.2d[0] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] - ld1 {v3.2d}, [pA], #16 + fmla v26.2d, v6.2d, v14.2d[0] + fmla v30.2d, v6.2d, v15.2d[0] - fmla v18.2d, v6.2d, v12.2d[0] + fmla v19.2d, v7.2d, v12.2d[0] fmla v23.2d, v7.2d, v13.2d[0] - prfm PLDL1KEEP, [pB, #640] + ldp q2, q3, [pA], #32 + + fmla v27.2d, v7.2d, v14.2d[0] + fmla v31.2d, v7.2d, v15.2d[0] .endm .macro KERNEL8x4_E fmla v16.2d, v4.2d, 
v12.2d[0] - fmla v17.2d, v5.2d, v12.2d[0] - fmla v18.2d, v6.2d, v12.2d[0] - fmla v19.2d, v7.2d, v12.2d[0] fmla v20.2d, v4.2d, v13.2d[0] - fmla v21.2d, v5.2d, v13.2d[0] - fmla v22.2d, v6.2d, v13.2d[0] - fmla v23.2d, v7.2d, v13.2d[0] fmla v24.2d, v4.2d, v14.2d[0] - fmla v25.2d, v5.2d, v14.2d[0] - fmla v26.2d, v6.2d, v14.2d[0] - fmla v27.2d, v7.2d, v14.2d[0] fmla v28.2d, v4.2d, v15.2d[0] + + fmla v17.2d, v5.2d, v12.2d[0] + fmla v25.2d, v5.2d, v14.2d[0] + fmla v21.2d, v5.2d, v13.2d[0] fmla v29.2d, v5.2d, v15.2d[0] + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + + fmla v18.2d, v6.2d, v12.2d[0] + fmla v22.2d, v6.2d, v13.2d[0] + fmla v26.2d, v6.2d, v14.2d[0] fmla v30.2d, v6.2d, v15.2d[0] + + fmla v19.2d, v7.2d, v12.2d[0] + fmla v23.2d, v7.2d, v13.2d[0] + fmla v27.2d, v7.2d, v14.2d[0] fmla v31.2d, v7.2d, v15.2d[0] .endm .macro KERNEL8x4_SUB - ld1 {v0.2d, v1.2d}, [pA] - add pA, pA, #32 - ld1 {v2.2d, v3.2d}, [pA] - add pA, pA, #32 - ldp d8, d9, [pB] - add pB, pB, #16 - ldp d10, d11, [pB] - add pB, pB, #16 + ldp q0, q1, [pA], #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v17.2d, v1.2d, v8.2d[0] - fmla v18.2d, v2.2d, v8.2d[0] - fmla v19.2d, v3.2d, v8.2d[0] + ldp d8, d9, [pB], #16 + fmla v16.2d, v0.2d, v8.2d[0] fmla v20.2d, v0.2d, v9.2d[0] + + ldp d10, d11, [pB], #16 + + fmla v17.2d, v1.2d, v8.2d[0] fmla v21.2d, v1.2d, v9.2d[0] - fmla v22.2d, v2.2d, v9.2d[0] - fmla v23.2d, v3.2d, v9.2d[0] + + ldp q2, q3, [pA], #32 fmla v24.2d, v0.2d, v10.2d[0] + fmla v28.2d, v0.2d, v11.2d[0] + fmla v25.2d, v1.2d, v10.2d[0] + fmla v29.2d, v1.2d, v11.2d[0] + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + + fmla v18.2d, v2.2d, v8.2d[0] + fmla v22.2d, v2.2d, v9.2d[0] + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] + fmla v26.2d, v2.2d, v10.2d[0] + fmla v30.2d, v2.2d, v11.2d[0] + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + + fmla v19.2d, v3.2d, v8.2d[0] fmla v27.2d, v3.2d, v10.2d[0] - fmla v28.2d, v0.2d, v11.2d[0] - fmla v29.2d, v1.2d, v11.2d[0] - fmla v30.2d, v2.2d, v11.2d[0] fmla v31.2d, v3.2d, v11.2d[0] + fmla v23.2d, v3.2d, 
v9.2d[0] .endm .macro SAVE8x4 fmov alpha0, alpha - ld1 {v0.2d, v1.2d}, [pCRow0] + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + ldp q0, q1, [pCRow0] fmla v0.2d, v16.2d, alphaV0 fmla v1.2d, v17.2d, alphaV0 - st1 {v0.2d, v1.2d}, [pCRow0] + stp q0, q1, [pCRow0] add pCRow0, pCRow0, #32 + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] - ld1 {v2.2d, v3.2d}, [pCRow0] + ldp q2, q3, [pCRow0] fmla v2.2d, v18.2d, alphaV0 fmla v3.2d, v19.2d, alphaV0 - st1 {v2.2d, v3.2d}, [pCRow0] + stp q2, q3, [pCRow0] add pCRow0, pCRow0, #32 - ld1 {v4.2d, v5.2d}, [pCRow1] + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + ldp q4, q5, [pCRow1] fmla v4.2d, v20.2d, alphaV0 fmla v5.2d, v21.2d, alphaV0 - st1 {v4.2d, v5.2d}, [pCRow1] + stp q4, q5, [pCRow1] add pCRow1, pCRow1, #32 + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] - ld1 {v6.2d, v7.2d}, [pCRow1] + ldp q6, q7, [pCRow1] fmla v6.2d, v22.2d, alphaV0 fmla v7.2d, v23.2d, alphaV0 - st1 {v6.2d, v7.2d}, [pCRow1] + stp q6, q7, [pCRow1] add pCRow1, pCRow1, #32 - ld1 {v0.2d, v1.2d}, [pCRow2] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + ldp q0, q1, [pCRow2] fmla v0.2d, v24.2d, alphaV0 fmla v1.2d, v25.2d, alphaV0 - st1 {v0.2d, v1.2d}, [pCRow2] + stp q0, q1, [pCRow2] add pCRow2, pCRow2, #32 - ld1 {v2.2d, v3.2d}, [pCRow2] + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + ldp q2, q3, [pCRow2] fmla v2.2d, v26.2d, alphaV0 fmla v3.2d, v27.2d, alphaV0 - st1 {v2.2d, v3.2d}, [pCRow2] + stp q2, q3, [pCRow2] add pCRow2, pCRow2, #32 - ld1 {v4.2d, v5.2d}, [pCRow3] + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] + + ldp q4, q5, [pCRow3] fmla v4.2d, v28.2d, alphaV0 fmla v5.2d, v29.2d, alphaV0 - st1 {v4.2d, v5.2d}, [pCRow3] + stp q4, q5, [pCRow3] add pCRow3, pCRow3, #32 + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] - ld1 {v6.2d, v7.2d}, [pCRow3] + ldp q6, q7, [pCRow3] fmla v6.2d, v30.2d, alphaV0 fmla v7.2d, v31.2d, alphaV0 - st1 {v6.2d, v7.2d}, [pCRow3] + stp q6, q7, [pCRow3] add pCRow3, pCRow3, #32 - - prfm PLDL2KEEP, [pCRow0, #128] - prfm PLDL2KEEP, [pCRow1, #128] - prfm PLDL2KEEP, [pCRow2, #128] - prfm 
PLDL2KEEP, [pCRow3, #128] .endm /******************************************************************************/ @@ -422,30 +433,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE4x4 + fmov alpha0, alpha ld1 {v8.2d, v9.2d}, [pCRow0] fmla v8.2d, v16.2d, alphaV0 - fmla v9.2d, v17.2d, alphaV1 + fmla v9.2d, v17.2d, alphaV0 st1 {v8.2d, v9.2d}, [pCRow0] add pCRow1, pCRow0, LDC ld1 {v12.2d, v13.2d}, [pCRow1] - fmla v12.2d, v20.2d, alphaV2 - fmla v13.2d, v21.2d, alphaV3 + fmla v12.2d, v20.2d, alphaV0 + fmla v13.2d, v21.2d, alphaV0 st1 {v12.2d, v13.2d}, [pCRow1] add pCRow2, pCRow1, LDC ld1 {v8.2d, v9.2d}, [pCRow2] fmla v8.2d, v24.2d, alphaV0 - fmla v9.2d, v25.2d, alphaV1 + fmla v9.2d, v25.2d, alphaV0 st1 {v8.2d, v9.2d}, [pCRow2] add pCRow1, pCRow2, LDC ld1 {v12.2d, v13.2d}, [pCRow1] - fmla v12.2d, v28.2d, alphaV2 - fmla v13.2d, v29.2d, alphaV3 + fmla v12.2d, v28.2d, alphaV0 + fmla v13.2d, v29.2d, alphaV0 st1 {v12.2d, v13.2d}, [pCRow1] add pCRow0, pCRow0, #32 @@ -474,6 +486,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE2x4 + fmov alpha0, alpha ld1 {v8.2d}, [pCRow0] fmla v8.2d, v16.2d, alphaV0 st1 {v8.2d}, [pCRow0] @@ -481,19 +494,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add pCRow1, pCRow0, LDC ld1 {v12.2d}, [pCRow1] - fmla v12.2d, v20.2d, alphaV1 + fmla v12.2d, v20.2d, alphaV0 st1 {v12.2d}, [pCRow1] add pCRow2, pCRow1, LDC ld1 {v8.2d}, [pCRow2] - fmla v8.2d, v24.2d, alphaV2 + fmla v8.2d, v24.2d, alphaV0 st1 {v8.2d}, [pCRow2] add pCRow1, pCRow2, LDC ld1 {v12.2d}, [pCRow1] - fmla v12.2d, v28.2d, alphaV3 + fmla v12.2d, v28.2d, alphaV0 st1 {v12.2d}, [pCRow1] add pCRow0, pCRow0, #16 @@ -518,6 +531,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro SAVE1x4 + fmov alpha0, alpha add pCRow1, pCRow0, LDC ld1 {v8.d}[0], [pCRow0] @@ -531,7 +545,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v12.d}[0], [pCRow2] ld1 {v12.d}[1], [pCRow1] - fmla v12.2d, v20.2d, alphaV1 + fmla v12.2d, v20.2d, alphaV0 st1 {v12.d}[0], [pCRow2] st1 {v12.d}[1], [pCRow1] @@ -571,20 +585,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE8x2 + fmov alpha0, alpha add pCRow1, pCRow0, LDC ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] fmla v0.2d, v16.2d, alphaV0 - fmla v1.2d, v17.2d, alphaV1 - fmla v2.2d, v18.2d, alphaV2 - fmla v3.2d, v19.2d, alphaV3 + fmla v1.2d, v17.2d, alphaV0 + fmla v2.2d, v18.2d, alphaV0 + fmla v3.2d, v19.2d, alphaV0 st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] ld1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1] fmla v4.2d, v20.2d, alphaV0 - fmla v5.2d, v21.2d, alphaV1 - fmla v6.2d, v22.2d, alphaV2 - fmla v7.2d, v23.2d, alphaV3 + fmla v5.2d, v21.2d, alphaV0 + fmla v6.2d, v22.2d, alphaV0 + fmla v7.2d, v23.2d, alphaV0 st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1] add pCRow0, pCRow0, #64 @@ -612,16 +627,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE4x2 + fmov alpha0, alpha ld1 {v8.2d, v9.2d}, [pCRow0] fmla v8.2d, v16.2d, alphaV0 - fmla v9.2d, v17.2d, alphaV1 + fmla v9.2d, v17.2d, alphaV0 st1 {v8.2d, v9.2d}, [pCRow0] add pCRow1, pCRow0, LDC ld1 {v12.2d, v13.2d}, [pCRow1] - fmla v12.2d, v20.2d, alphaV2 - fmla v13.2d, v21.2d, alphaV3 + fmla v12.2d, v20.2d, alphaV0 + fmla v13.2d, v21.2d, alphaV0 st1 {v12.2d, v13.2d}, [pCRow1] add pCRow0, pCRow0, #32 @@ -646,6 +662,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE2x2 + fmov alpha0, alpha ld1 {v8.2d}, [pCRow0] fmla v8.2d, v16.2d, alphaV0 st1 {v8.2d}, [pCRow0] @@ -653,7 +670,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
add pCRow1 , pCRow0, LDC ld1 {v12.2d}, [pCRow1] - fmla v12.2d, v20.2d, alphaV1 + fmla v12.2d, v20.2d, alphaV0 st1 {v12.2d}, [pCRow1] add pCRow0, pCRow0, #16 @@ -676,6 +693,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE1x2 + fmov alpha0, alpha add pCRow1 , pCRow0, LDC ld1 {v8.d}[0], [pCRow0] @@ -713,11 +731,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE8x1 + fmov alpha0, alpha ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] fmla v0.2d, v16.2d, alphaV0 - fmla v1.2d, v17.2d, alphaV1 - fmla v2.2d, v18.2d, alphaV2 - fmla v3.2d, v19.2d, alphaV3 + fmla v1.2d, v17.2d, alphaV0 + fmla v2.2d, v18.2d, alphaV0 + fmla v3.2d, v19.2d, alphaV0 st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] add pCRow0, pCRow0, #64 @@ -743,9 +762,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE4x1 + fmov alpha0, alpha ld1 {v8.2d, v9.2d}, [pCRow0] fmla v8.2d, v16.2d, alphaV0 - fmla v9.2d, v17.2d, alphaV1 + fmla v9.2d, v17.2d, alphaV0 st1 {v8.2d, v9.2d}, [pCRow0] add pCRow0, pCRow0, #32 @@ -769,6 +789,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE2x1 + fmov alpha0, alpha ld1 {v8.2d}, [pCRow0] fmla v8.2d, v16.2d, alphaV0 st1 {v8.2d}, [pCRow0] @@ -793,6 +814,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE1x1 + fmov alpha0, alpha ldr d8, [pCRow0] fmadd d8, d16, alpha0, d8 str d8, [pCRow0] @@ -820,6 +842,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
stp x26, x27, [sp, #(9 * 16)] str x28, [sp, #(10 * 16)] + prfm PLDL1KEEP, [origPB] + prfm PLDL1KEEP, [origPA] + fmov alpha, d0 lsl LDC, LDC, #3 // ldc = ldc * 8 @@ -838,6 +863,7 @@ dgemm_kernel_L4_BEGIN: add pCRow1, pCRow0, LDC add pCRow2, pCRow1, LDC add pCRow3, pCRow2, LDC + add pC, pCRow3, LDC mov pA, origPA // pA = start of A array @@ -849,6 +875,7 @@ dgemm_kernel_L4_M8_BEGIN: cmp counterI, #0 ble dgemm_kernel_L4_M4_BEGIN + .align 5 dgemm_kernel_L4_M8_20: mov pB, origPB @@ -868,8 +895,8 @@ dgemm_kernel_L4_M8_20: subs counterL, counterL, #2 // subtract 2 ble dgemm_kernel_L4_M8_22a - .align 5 + .align 5 dgemm_kernel_L4_M8_22: KERNEL8x4_M1 @@ -884,7 +911,7 @@ dgemm_kernel_L4_M8_22: subs counterL, counterL, #1 bgt dgemm_kernel_L4_M8_22 - + .align 5 dgemm_kernel_L4_M8_22a: KERNEL8x4_M1 @@ -898,6 +925,7 @@ dgemm_kernel_L4_M8_22a: b dgemm_kernel_L4_M8_44 + .align 5 dgemm_kernel_L4_M8_32: tst counterL, #1 @@ -923,6 +951,7 @@ dgemm_kernel_L4_M8_44: ands counterL , origK, #7 ble dgemm_kernel_L4_M8_100 + .align 5 dgemm_kernel_L4_M8_46: KERNEL8x4_SUB @@ -931,6 +960,9 @@ dgemm_kernel_L4_M8_46: bne dgemm_kernel_L4_M8_46 dgemm_kernel_L4_M8_100: + prfm PLDL1KEEP, [pA] + prfm PLDL1KEEP, [pA, #64] + prfm PLDL1KEEP, [origPB] SAVE8x4 From 278511ad2d1727a7ed74c38e3664b5e51b04adc6 Mon Sep 17 00:00:00 2001 From: Ashwin Sekhar T K Date: Thu, 24 Mar 2016 10:31:28 +0530 Subject: [PATCH 17/48] Cortex-A57: Fix clang compilation errors --- kernel/arm64/cgemm_kernel_4x4.S | 522 ++++++++++----------- kernel/arm64/cgemm_kernel_8x4.S | 750 +++++++++++++++---------------- kernel/arm64/ctrmm_kernel_4x4.S | 304 ++++++------- kernel/arm64/ctrmm_kernel_8x4.S | 750 +++++++++++++++---------------- kernel/arm64/dgemm_kernel_4x4.S | 210 ++++----- kernel/arm64/dgemm_kernel_4x8.S | 314 ++++++------- kernel/arm64/dgemm_kernel_8x4.S | 228 +++++----- kernel/arm64/dtrmm_kernel_4x4.S | 108 ++--- kernel/arm64/dtrmm_kernel_4x8.S | 314 ++++++------- kernel/arm64/dtrmm_kernel_8x4.S | 258 +++++------ 
kernel/arm64/sgemm_kernel_16x4.S | 414 ++++++++--------- kernel/arm64/sgemm_kernel_4x4.S | 236 +++++----- kernel/arm64/sgemm_kernel_8x8.S | 472 +++++++++---------- kernel/arm64/strmm_kernel_16x4.S | 414 ++++++++--------- kernel/arm64/strmm_kernel_4x4.S | 108 ++--- kernel/arm64/strmm_kernel_8x8.S | 472 +++++++++---------- kernel/arm64/zgemm_kernel_4x4.S | 510 ++++++++++----------- kernel/arm64/ztrmm_kernel_4x4.S | 510 ++++++++++----------- 18 files changed, 3447 insertions(+), 3447 deletions(-) mode change 100755 => 100644 kernel/arm64/cgemm_kernel_8x4.S mode change 100755 => 100644 kernel/arm64/ctrmm_kernel_8x4.S mode change 100755 => 100644 kernel/arm64/dgemm_kernel_4x8.S mode change 100755 => 100644 kernel/arm64/dgemm_kernel_8x4.S mode change 100755 => 100644 kernel/arm64/dtrmm_kernel_4x8.S mode change 100755 => 100644 kernel/arm64/dtrmm_kernel_8x4.S mode change 100755 => 100644 kernel/arm64/strmm_kernel_16x4.S mode change 100755 => 100644 kernel/arm64/strmm_kernel_8x8.S diff --git a/kernel/arm64/cgemm_kernel_4x4.S b/kernel/arm64/cgemm_kernel_4x4.S index 7a70264ca..7f2ddea07 100644 --- a/kernel/arm64/cgemm_kernel_4x4.S +++ b/kernel/arm64/cgemm_kernel_4x4.S @@ -179,93 +179,93 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld2 {v2.4s, v3.4s}, [ppA] add ppA, ppA, #32 - fmul v16.4s, v0.4s, v8.4s[0] - OP_ii v16.4s, v1.4s, v9.4s[0] + fmul v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v9.s[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v17.16b, v17.16b, v17.16b - fmls v17.4s, v0.4s, v9.4s[0] + fmls v17.4s, v0.4s, v9.s[0] #else - fmul v17.4s, v0.4s, v9.4s[0] + fmul v17.4s, v0.4s, v9.s[0] #endif - OP_ir v17.4s, v1.4s, v8.4s[0] + OP_ir v17.4s, v1.4s, v8.s[0] - fmul v20.4s, v0.4s, v8.4s[1] - OP_ii v20.4s, v1.4s, v9.4s[1] + fmul v20.4s, v0.4s, v8.s[1] + OP_ii v20.4s, v1.4s, v9.s[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v21.16b, v21.16b, v21.16b - fmls v21.4s, v0.4s, v9.4s[1] + fmls v21.4s, v0.4s, v9.s[1] #else - fmul v21.4s, v0.4s, v9.4s[1] + fmul v21.4s, v0.4s, v9.s[1] #endif - OP_ir v21.4s, v1.4s, v8.4s[1] + OP_ir v21.4s, v1.4s, v8.s[1] - fmul v24.4s, v0.4s, v8.4s[2] - OP_ii v24.4s, v1.4s, v9.4s[2] + fmul v24.4s, v0.4s, v8.s[2] + OP_ii v24.4s, v1.4s, v9.s[2] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v25.16b, v25.16b, v25.16b - fmls v25.4s, v0.4s, v9.4s[2] + fmls v25.4s, v0.4s, v9.s[2] #else - fmul v25.4s, v0.4s, v9.4s[2] + fmul v25.4s, v0.4s, v9.s[2] #endif - OP_ir v25.4s, v1.4s, v8.4s[2] + OP_ir v25.4s, v1.4s, v8.s[2] - fmul v28.4s, v0.4s, v8.4s[3] - OP_ii v28.4s, v1.4s, v9.4s[3] + fmul v28.4s, v0.4s, v8.s[3] + OP_ii v28.4s, v1.4s, v9.s[3] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v29.16b, v29.16b, v29.16b - fmls v29.4s, v0.4s, v9.4s[3] + fmls v29.4s, v0.4s, v9.s[3] #else - fmul v29.4s, v0.4s, v9.4s[3] + fmul v29.4s, v0.4s, v9.s[3] #endif - OP_ir v29.4s, v1.4s, v8.4s[3] + OP_ir v29.4s, v1.4s, v8.s[3] - fmul v18.4s, v2.4s, v8.4s[0] - OP_ii v18.4s, v3.4s, 
v9.4s[0] + fmul v18.4s, v2.4s, v8.s[0] + OP_ii v18.4s, v3.4s, v9.s[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v19.16b, v19.16b, v19.16b - fmls v19.4s, v2.4s, v9.4s[0] + fmls v19.4s, v2.4s, v9.s[0] #else - fmul v19.4s, v2.4s, v9.4s[0] + fmul v19.4s, v2.4s, v9.s[0] #endif - OP_ir v19.4s, v3.4s, v8.4s[0] + OP_ir v19.4s, v3.4s, v8.s[0] - fmul v22.4s, v2.4s, v8.4s[1] - OP_ii v22.4s, v3.4s, v9.4s[1] + fmul v22.4s, v2.4s, v8.s[1] + OP_ii v22.4s, v3.4s, v9.s[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v23.16b, v23.16b, v23.16b - fmls v23.4s, v2.4s, v9.4s[1] + fmls v23.4s, v2.4s, v9.s[1] #else - fmul v23.4s, v2.4s, v9.4s[1] + fmul v23.4s, v2.4s, v9.s[1] #endif - OP_ir v23.4s, v3.4s, v8.4s[1] + OP_ir v23.4s, v3.4s, v8.s[1] - fmul v26.4s, v2.4s, v8.4s[2] - OP_ii v26.4s, v3.4s, v9.4s[2] + fmul v26.4s, v2.4s, v8.s[2] + OP_ii v26.4s, v3.4s, v9.s[2] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v27.16b, v27.16b, v27.16b - fmls v27.4s, v2.4s, v9.4s[2] + fmls v27.4s, v2.4s, v9.s[2] #else - fmul v27.4s, v2.4s, v9.4s[2] + fmul v27.4s, v2.4s, v9.s[2] #endif - OP_ir v27.4s, v3.4s, v8.4s[2] + OP_ir v27.4s, v3.4s, v8.s[2] - fmul v30.4s, v2.4s, v8.4s[3] - OP_ii v30.4s, v3.4s, v9.4s[3] + fmul v30.4s, v2.4s, v8.s[3] + OP_ii v30.4s, v3.4s, v9.s[3] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v31.16b, v31.16b, v31.16b - fmls v31.4s, v2.4s, v9.4s[3] + fmls v31.4s, v2.4s, v9.s[3] #else - fmul v31.4s, v2.4s, v9.4s[3] + fmul v31.4s, v2.4s, v9.s[3] #endif - OP_ir v31.4s, v3.4s, v8.4s[3] + OP_ir v31.4s, v3.4s, v8.s[3] ld2 {v12.4s, v13.4s}, [pB] add pB, pB, #32 @@ -276,159 +276,159 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL8x4_M1 - OP_rr v16.4s, v0.4s, v8.4s[0] - OP_ii v16.4s, v1.4s, v9.4s[0] - OP_ri v17.4s, v0.4s, v9.4s[0] - OP_ir v17.4s, v1.4s, v8.4s[0] + OP_rr v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v9.s[0] + OP_ri v17.4s, v0.4s, v9.s[0] + OP_ir v17.4s, v1.4s, v8.s[0] ld2 {v12.4s, v13.4s}, [pB] // for next round add pB, pB, #32 - OP_rr v20.4s, v0.4s, v8.4s[1] - OP_ii v20.4s, v1.4s, v9.4s[1] - OP_ri v21.4s, v0.4s, v9.4s[1] - OP_ir v21.4s, v1.4s, v8.4s[1] + OP_rr v20.4s, v0.4s, v8.s[1] + OP_ii v20.4s, v1.4s, v9.s[1] + OP_ri v21.4s, v0.4s, v9.s[1] + OP_ir v21.4s, v1.4s, v8.s[1] prfm PLDL1KEEP, [pB, #512] - OP_rr v24.4s, v0.4s, v8.4s[2] - OP_ii v24.4s, v1.4s, v9.4s[2] - OP_ri v25.4s, v0.4s, v9.4s[2] - OP_ir v25.4s, v1.4s, v8.4s[2] + OP_rr v24.4s, v0.4s, v8.s[2] + OP_ii v24.4s, v1.4s, v9.s[2] + OP_ri v25.4s, v0.4s, v9.s[2] + OP_ir v25.4s, v1.4s, v8.s[2] ld2 {v4.4s, v5.4s} , [pA] // for next round add pA, pA, #32 - OP_rr v28.4s, v0.4s, v8.4s[3] - OP_ii v28.4s, v1.4s, v9.4s[3] - OP_ri v29.4s, v0.4s, v9.4s[3] - OP_ir v29.4s, v1.4s, v8.4s[3] + OP_rr v28.4s, v0.4s, v8.s[3] + OP_ii v28.4s, v1.4s, v9.s[3] + OP_ri v29.4s, v0.4s, v9.s[3] + OP_ir v29.4s, v1.4s, v8.s[3] prfm PLDL1KEEP, [pA, #512] - OP_rr v18.4s, v2.4s, v8.4s[0] - OP_ii v18.4s, v3.4s, v9.4s[0] - OP_ri v19.4s, v2.4s, v9.4s[0] - OP_ir v19.4s, v3.4s, v8.4s[0] + OP_rr v18.4s, v2.4s, v8.s[0] + OP_ii v18.4s, v3.4s, v9.s[0] + OP_ri v19.4s, v2.4s, v9.s[0] + OP_ir v19.4s, v3.4s, v8.s[0] ld2 {v6.4s, v7.4s} , [ppA] // for next round add ppA, ppA, #32 - OP_rr v22.4s, v2.4s, v8.4s[1] - OP_ii v22.4s, v3.4s, v9.4s[1] - OP_ri v23.4s, v2.4s, v9.4s[1] - OP_ir v23.4s, v3.4s, v8.4s[1] + OP_rr v22.4s, v2.4s, v8.s[1] + OP_ii v22.4s, v3.4s, v9.s[1] + OP_ri v23.4s, v2.4s, v9.s[1] + OP_ir v23.4s, v3.4s, v8.s[1] prfm PLDL1KEEP, [ppA, #512] - OP_rr v26.4s, v2.4s, v8.4s[2] - OP_ii v26.4s, v3.4s, v9.4s[2] - OP_ri v27.4s, v2.4s, v9.4s[2] - OP_ir v27.4s, v3.4s, v8.4s[2] + OP_rr v26.4s, v2.4s, v8.s[2] + OP_ii v26.4s, v3.4s, v9.s[2] + 
OP_ri v27.4s, v2.4s, v9.s[2] + OP_ir v27.4s, v3.4s, v8.s[2] - OP_rr v30.4s, v2.4s, v8.4s[3] - OP_ii v30.4s, v3.4s, v9.4s[3] - OP_ri v31.4s, v2.4s, v9.4s[3] - OP_ir v31.4s, v3.4s, v8.4s[3] + OP_rr v30.4s, v2.4s, v8.s[3] + OP_ii v30.4s, v3.4s, v9.s[3] + OP_ri v31.4s, v2.4s, v9.s[3] + OP_ir v31.4s, v3.4s, v8.s[3] .endm .macro KERNEL8x4_M2 - OP_rr v16.4s, v4.4s, v12.4s[0] - OP_ii v16.4s, v5.4s, v13.4s[0] - OP_ri v17.4s, v4.4s, v13.4s[0] - OP_ir v17.4s, v5.4s, v12.4s[0] + OP_rr v16.4s, v4.4s, v12.s[0] + OP_ii v16.4s, v5.4s, v13.s[0] + OP_ri v17.4s, v4.4s, v13.s[0] + OP_ir v17.4s, v5.4s, v12.s[0] ld2 {v8.4s, v9.4s}, [pB] // for next round add pB, pB, #32 - OP_rr v20.4s, v4.4s, v12.4s[1] - OP_ii v20.4s, v5.4s, v13.4s[1] - OP_ri v21.4s, v4.4s, v13.4s[1] - OP_ir v21.4s, v5.4s, v12.4s[1] + OP_rr v20.4s, v4.4s, v12.s[1] + OP_ii v20.4s, v5.4s, v13.s[1] + OP_ri v21.4s, v4.4s, v13.s[1] + OP_ir v21.4s, v5.4s, v12.s[1] prfm PLDL1KEEP, [pA, #512] - OP_rr v24.4s, v4.4s, v12.4s[2] - OP_ii v24.4s, v5.4s, v13.4s[2] - OP_ri v25.4s, v4.4s, v13.4s[2] - OP_ir v25.4s, v5.4s, v12.4s[2] + OP_rr v24.4s, v4.4s, v12.s[2] + OP_ii v24.4s, v5.4s, v13.s[2] + OP_ri v25.4s, v4.4s, v13.s[2] + OP_ir v25.4s, v5.4s, v12.s[2] ld2 {v0.4s, v1.4s}, [pA] // for next round add pA, pA, #32 - OP_rr v28.4s, v4.4s, v12.4s[3] - OP_ii v28.4s, v5.4s, v13.4s[3] - OP_ri v29.4s, v4.4s, v13.4s[3] - OP_ir v29.4s, v5.4s, v12.4s[3] + OP_rr v28.4s, v4.4s, v12.s[3] + OP_ii v28.4s, v5.4s, v13.s[3] + OP_ri v29.4s, v4.4s, v13.s[3] + OP_ir v29.4s, v5.4s, v12.s[3] prfm PLDL1KEEP, [ppA, #512] - OP_rr v18.4s, v6.4s, v12.4s[0] - OP_ii v18.4s, v7.4s, v13.4s[0] - OP_ri v19.4s, v6.4s, v13.4s[0] - OP_ir v19.4s, v7.4s, v12.4s[0] + OP_rr v18.4s, v6.4s, v12.s[0] + OP_ii v18.4s, v7.4s, v13.s[0] + OP_ri v19.4s, v6.4s, v13.s[0] + OP_ir v19.4s, v7.4s, v12.s[0] ld2 {v2.4s, v3.4s}, [ppA] // for next round add ppA, ppA, #32 - OP_rr v22.4s, v6.4s, v12.4s[1] - OP_ii v22.4s, v7.4s, v13.4s[1] - OP_ri v23.4s, v6.4s, v13.4s[1] - OP_ir v23.4s, v7.4s, 
v12.4s[1] + OP_rr v22.4s, v6.4s, v12.s[1] + OP_ii v22.4s, v7.4s, v13.s[1] + OP_ri v23.4s, v6.4s, v13.s[1] + OP_ir v23.4s, v7.4s, v12.s[1] prfm PLDL1KEEP, [pB, #512] - OP_rr v26.4s, v6.4s, v12.4s[2] - OP_ii v26.4s, v7.4s, v13.4s[2] - OP_ri v27.4s, v6.4s, v13.4s[2] - OP_ir v27.4s, v7.4s, v12.4s[2] + OP_rr v26.4s, v6.4s, v12.s[2] + OP_ii v26.4s, v7.4s, v13.s[2] + OP_ri v27.4s, v6.4s, v13.s[2] + OP_ir v27.4s, v7.4s, v12.s[2] - OP_rr v30.4s, v6.4s, v12.4s[3] - OP_ii v30.4s, v7.4s, v13.4s[3] - OP_ri v31.4s, v6.4s, v13.4s[3] - OP_ir v31.4s, v7.4s, v12.4s[3] + OP_rr v30.4s, v6.4s, v12.s[3] + OP_ii v30.4s, v7.4s, v13.s[3] + OP_ri v31.4s, v6.4s, v13.s[3] + OP_ir v31.4s, v7.4s, v12.s[3] .endm .macro KERNEL8x4_E - OP_rr v16.4s, v4.4s, v12.4s[0] - OP_ii v16.4s, v5.4s, v13.4s[0] - OP_ri v17.4s, v4.4s, v13.4s[0] - OP_ir v17.4s, v5.4s, v12.4s[0] - - OP_rr v20.4s, v4.4s, v12.4s[1] - OP_ii v20.4s, v5.4s, v13.4s[1] - OP_ri v21.4s, v4.4s, v13.4s[1] - OP_ir v21.4s, v5.4s, v12.4s[1] - - OP_rr v24.4s, v4.4s, v12.4s[2] - OP_ii v24.4s, v5.4s, v13.4s[2] - OP_ri v25.4s, v4.4s, v13.4s[2] - OP_ir v25.4s, v5.4s, v12.4s[2] - - OP_rr v28.4s, v4.4s, v12.4s[3] - OP_ii v28.4s, v5.4s, v13.4s[3] - OP_ri v29.4s, v4.4s, v13.4s[3] - OP_ir v29.4s, v5.4s, v12.4s[3] - - OP_rr v18.4s, v6.4s, v12.4s[0] - OP_ii v18.4s, v7.4s, v13.4s[0] - OP_ri v19.4s, v6.4s, v13.4s[0] - OP_ir v19.4s, v7.4s, v12.4s[0] - - OP_rr v22.4s, v6.4s, v12.4s[1] - OP_ii v22.4s, v7.4s, v13.4s[1] - OP_ri v23.4s, v6.4s, v13.4s[1] - OP_ir v23.4s, v7.4s, v12.4s[1] - - OP_rr v26.4s, v6.4s, v12.4s[2] - OP_ii v26.4s, v7.4s, v13.4s[2] - OP_ri v27.4s, v6.4s, v13.4s[2] - OP_ir v27.4s, v7.4s, v12.4s[2] - - OP_rr v30.4s, v6.4s, v12.4s[3] - OP_ii v30.4s, v7.4s, v13.4s[3] - OP_ri v31.4s, v6.4s, v13.4s[3] - OP_ir v31.4s, v7.4s, v12.4s[3] + OP_rr v16.4s, v4.4s, v12.s[0] + OP_ii v16.4s, v5.4s, v13.s[0] + OP_ri v17.4s, v4.4s, v13.s[0] + OP_ir v17.4s, v5.4s, v12.s[0] + + OP_rr v20.4s, v4.4s, v12.s[1] + OP_ii v20.4s, v5.4s, v13.s[1] + OP_ri v21.4s, v4.4s, 
v13.s[1] + OP_ir v21.4s, v5.4s, v12.s[1] + + OP_rr v24.4s, v4.4s, v12.s[2] + OP_ii v24.4s, v5.4s, v13.s[2] + OP_ri v25.4s, v4.4s, v13.s[2] + OP_ir v25.4s, v5.4s, v12.s[2] + + OP_rr v28.4s, v4.4s, v12.s[3] + OP_ii v28.4s, v5.4s, v13.s[3] + OP_ri v29.4s, v4.4s, v13.s[3] + OP_ir v29.4s, v5.4s, v12.s[3] + + OP_rr v18.4s, v6.4s, v12.s[0] + OP_ii v18.4s, v7.4s, v13.s[0] + OP_ri v19.4s, v6.4s, v13.s[0] + OP_ir v19.4s, v7.4s, v12.s[0] + + OP_rr v22.4s, v6.4s, v12.s[1] + OP_ii v22.4s, v7.4s, v13.s[1] + OP_ri v23.4s, v6.4s, v13.s[1] + OP_ir v23.4s, v7.4s, v12.s[1] + + OP_rr v26.4s, v6.4s, v12.s[2] + OP_ii v26.4s, v7.4s, v13.s[2] + OP_ri v27.4s, v6.4s, v13.s[2] + OP_ir v27.4s, v7.4s, v12.s[2] + + OP_rr v30.4s, v6.4s, v12.s[3] + OP_ii v30.4s, v7.4s, v13.s[3] + OP_ri v31.4s, v6.4s, v13.s[3] + OP_ir v31.4s, v7.4s, v12.s[3] .endm .macro KERNEL8x4_SUB @@ -437,48 +437,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.4s, v1.4s}, [pA] add pA, pA, #32 - OP_rr v16.4s, v0.4s, v8.4s[0] - OP_ii v16.4s, v1.4s, v9.4s[0] - OP_ri v17.4s, v0.4s, v9.4s[0] - OP_ir v17.4s, v1.4s, v8.4s[0] + OP_rr v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v9.s[0] + OP_ri v17.4s, v0.4s, v9.s[0] + OP_ir v17.4s, v1.4s, v8.s[0] - OP_rr v20.4s, v0.4s, v8.4s[1] - OP_ii v20.4s, v1.4s, v9.4s[1] - OP_ri v21.4s, v0.4s, v9.4s[1] - OP_ir v21.4s, v1.4s, v8.4s[1] + OP_rr v20.4s, v0.4s, v8.s[1] + OP_ii v20.4s, v1.4s, v9.s[1] + OP_ri v21.4s, v0.4s, v9.s[1] + OP_ir v21.4s, v1.4s, v8.s[1] ld2 {v2.4s, v3.4s}, [ppA] add ppA, ppA, #32 - OP_rr v24.4s, v0.4s, v8.4s[2] - OP_ii v24.4s, v1.4s, v9.4s[2] - OP_ri v25.4s, v0.4s, v9.4s[2] - OP_ir v25.4s, v1.4s, v8.4s[2] - - OP_rr v28.4s, v0.4s, v8.4s[3] - OP_ii v28.4s, v1.4s, v9.4s[3] - OP_ri v29.4s, v0.4s, v9.4s[3] - OP_ir v29.4s, v1.4s, v8.4s[3] - - OP_rr v18.4s, v2.4s, v8.4s[0] - OP_ii v18.4s, v3.4s, v9.4s[0] - OP_ri v19.4s, v2.4s, v9.4s[0] - OP_ir v19.4s, v3.4s, v8.4s[0] - - OP_rr v22.4s, v2.4s, v8.4s[1] - OP_ii v22.4s, v3.4s, v9.4s[1] - OP_ri 
v23.4s, v2.4s, v9.4s[1] - OP_ir v23.4s, v3.4s, v8.4s[1] - - OP_rr v26.4s, v2.4s, v8.4s[2] - OP_ii v26.4s, v3.4s, v9.4s[2] - OP_ri v27.4s, v2.4s, v9.4s[2] - OP_ir v27.4s, v3.4s, v8.4s[2] - - OP_rr v30.4s, v2.4s, v8.4s[3] - OP_ii v30.4s, v3.4s, v9.4s[3] - OP_ri v31.4s, v2.4s, v9.4s[3] - OP_ir v31.4s, v3.4s, v8.4s[3] + OP_rr v24.4s, v0.4s, v8.s[2] + OP_ii v24.4s, v1.4s, v9.s[2] + OP_ri v25.4s, v0.4s, v9.s[2] + OP_ir v25.4s, v1.4s, v8.s[2] + + OP_rr v28.4s, v0.4s, v8.s[3] + OP_ii v28.4s, v1.4s, v9.s[3] + OP_ri v29.4s, v0.4s, v9.s[3] + OP_ir v29.4s, v1.4s, v8.s[3] + + OP_rr v18.4s, v2.4s, v8.s[0] + OP_ii v18.4s, v3.4s, v9.s[0] + OP_ri v19.4s, v2.4s, v9.s[0] + OP_ir v19.4s, v3.4s, v8.s[0] + + OP_rr v22.4s, v2.4s, v8.s[1] + OP_ii v22.4s, v3.4s, v9.s[1] + OP_ri v23.4s, v2.4s, v9.s[1] + OP_ir v23.4s, v3.4s, v8.s[1] + + OP_rr v26.4s, v2.4s, v8.s[2] + OP_ii v26.4s, v3.4s, v9.s[2] + OP_ri v27.4s, v2.4s, v9.s[2] + OP_ir v27.4s, v3.4s, v8.s[2] + + OP_rr v30.4s, v2.4s, v8.s[3] + OP_ii v30.4s, v3.4s, v9.s[3] + OP_ri v31.4s, v2.4s, v9.s[3] + OP_ir v31.4s, v3.4s, v8.s[3] .endm .macro SAVE8x4 @@ -578,25 +578,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld2 {v0.4s, v1.4s}, [pA] add pA, pA, #32 - OP_rr v16.4s, v0.4s, v8.4s[0] - OP_ii v16.4s, v1.4s, v9.4s[0] - OP_ri v17.4s, v0.4s, v9.4s[0] - OP_ir v17.4s, v1.4s, v8.4s[0] - - OP_rr v20.4s, v0.4s, v8.4s[1] - OP_ii v20.4s, v1.4s, v9.4s[1] - OP_ri v21.4s, v0.4s, v9.4s[1] - OP_ir v21.4s, v1.4s, v8.4s[1] - - OP_rr v24.4s, v0.4s, v8.4s[2] - OP_ii v24.4s, v1.4s, v9.4s[2] - OP_ri v25.4s, v0.4s, v9.4s[2] - OP_ir v25.4s, v1.4s, v8.4s[2] - - OP_rr v28.4s, v0.4s, v8.4s[3] - OP_ii v28.4s, v1.4s, v9.4s[3] - OP_ri v29.4s, v0.4s, v9.4s[3] - OP_ir v29.4s, v1.4s, v8.4s[3] + OP_rr v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v9.s[0] + OP_ri v17.4s, v0.4s, v9.s[0] + OP_ir v17.4s, v1.4s, v8.s[0] + + OP_rr v20.4s, v0.4s, v8.s[1] + OP_ii v20.4s, v1.4s, v9.s[1] + OP_ri v21.4s, v0.4s, v9.s[1] + OP_ir v21.4s, v1.4s, v8.s[1] + + OP_rr v24.4s, v0.4s, v8.s[2] + OP_ii v24.4s, v1.4s, v9.s[2] + OP_ri v25.4s, v0.4s, v9.s[2] + OP_ir v25.4s, v1.4s, v8.s[2] + + OP_rr v28.4s, v0.4s, v8.s[3] + OP_ii v28.4s, v1.4s, v9.s[3] + OP_ri v29.4s, v0.4s, v9.s[3] + OP_ir v29.4s, v1.4s, v8.s[3] .endm .macro SAVE4x4 @@ -658,25 +658,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld2 {v0.2s, v1.2s}, [pA] add pA, pA, #16 - OP_rr v16.2s, v0.2s, v8.4s[0] - OP_ii v16.2s, v1.2s, v9.4s[0] - OP_ri v17.2s, v0.2s, v9.4s[0] - OP_ir v17.2s, v1.2s, v8.4s[0] - - OP_rr v20.2s, v0.2s, v8.4s[1] - OP_ii v20.2s, v1.2s, v9.4s[1] - OP_ri v21.2s, v0.2s, v9.4s[1] - OP_ir v21.2s, v1.2s, v8.4s[1] - - OP_rr v24.2s, v0.2s, v8.4s[2] - OP_ii v24.2s, v1.2s, v9.4s[2] - OP_ri v25.2s, v0.2s, v9.4s[2] - OP_ir v25.2s, v1.2s, v8.4s[2] - - OP_rr v28.2s, v0.2s, v8.4s[3] - OP_ii v28.2s, v1.2s, v9.4s[3] - OP_ri v29.2s, v0.2s, v9.4s[3] - OP_ir v29.2s, v1.2s, v8.4s[3] + OP_rr v16.2s, v0.2s, v8.s[0] + OP_ii v16.2s, v1.2s, v9.s[0] + OP_ri v17.2s, v0.2s, v9.s[0] + OP_ir v17.2s, v1.2s, v8.s[0] + + OP_rr v20.2s, v0.2s, v8.s[1] + OP_ii v20.2s, v1.2s, v9.s[1] + OP_ri v21.2s, v0.2s, v9.s[1] + OP_ir v21.2s, v1.2s, v8.s[1] + + OP_rr v24.2s, v0.2s, v8.s[2] + OP_ii v24.2s, v1.2s, v9.s[2] + OP_ri v25.2s, v0.2s, v9.s[2] + OP_ir v25.2s, v1.2s, v8.s[2] + + OP_rr v28.2s, v0.2s, v8.s[3] + OP_ii v28.2s, v1.2s, v9.s[3] + OP_ri v29.2s, v0.2s, v9.s[3] + OP_ir v29.2s, v1.2s, v8.s[3] .endm .macro SAVE2x4 @@ -738,25 +738,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld2 {v0.s, v1.s}[0], [pA] add pA, pA, #8 - OP_rr s16, s0, v8.4s[0] - OP_ii s16, s1, v9.4s[0] - OP_ri s17, s0, v9.4s[0] - OP_ir s17, s1, v8.4s[0] - - OP_rr s20, s0, v8.4s[1] - OP_ii s20, s1, v9.4s[1] - OP_ri s21, s0, v9.4s[1] - OP_ir s21, s1, v8.4s[1] - - OP_rr s24, s0, v8.4s[2] - OP_ii s24, s1, v9.4s[2] - OP_ri s25, s0, v9.4s[2] - OP_ir s25, s1, v8.4s[2] - - OP_rr s28, s0, v8.4s[3] - OP_ii s28, s1, v9.4s[3] - OP_ri s29, s0, v9.4s[3] - OP_ir s29, s1, v8.4s[3] + OP_rr s16, s0, v8.s[0] + OP_ii s16, s1, v9.s[0] + OP_ri s17, s0, v9.s[0] + OP_ir s17, s1, v8.s[0] + + OP_rr s20, s0, v8.s[1] + OP_ii s20, s1, v9.s[1] + OP_ri s21, s0, v9.s[1] + OP_ir s21, s1, v8.s[1] + + OP_rr s24, s0, v8.s[2] + OP_ii s24, s1, v9.s[2] + OP_ri s25, s0, v9.s[2] + OP_ir s25, s1, v8.s[2] + + OP_rr s28, s0, v8.s[3] + OP_ii s28, s1, v9.s[3] + OP_ri s29, s0, v9.s[3] + OP_ir s29, s1, v8.s[3] .endm .macro SAVE1x4 @@ -814,15 +814,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.4s, v1.4s}, [pA] add pA, pA, #32 - OP_rr v16.4s, v0.4s, v8.2s[0] - OP_ii v16.4s, v1.4s, v9.2s[0] - OP_ri v17.4s, v0.4s, v9.2s[0] - OP_ir v17.4s, v1.4s, v8.2s[0] + OP_rr v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v9.s[0] + OP_ri v17.4s, v0.4s, v9.s[0] + OP_ir v17.4s, v1.4s, v8.s[0] - OP_rr v20.4s, v0.4s, v8.2s[1] - OP_ii v20.4s, v1.4s, v9.2s[1] - OP_ri v21.4s, v0.4s, v9.2s[1] - OP_ir v21.4s, v1.4s, v8.2s[1] + OP_rr v20.4s, v0.4s, v8.s[1] + OP_ii v20.4s, v1.4s, v9.s[1] + OP_ri v21.4s, v0.4s, v9.s[1] + OP_ir v21.4s, v1.4s, v8.s[1] .endm .macro SAVE4x2 @@ -862,15 +862,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld2 {v0.2s, v1.2s}, [pA] add pA, pA, #16 - OP_rr v16.2s, v0.2s, v8.2s[0] - OP_ii v16.2s, v1.2s, v9.2s[0] - OP_ri v17.2s, v0.2s, v9.2s[0] - OP_ir v17.2s, v1.2s, v8.2s[0] + OP_rr v16.2s, v0.2s, v8.s[0] + OP_ii v16.2s, v1.2s, v9.s[0] + OP_ri v17.2s, v0.2s, v9.s[0] + OP_ir v17.2s, v1.2s, v8.s[0] - OP_rr v20.2s, v0.2s, v8.2s[1] - OP_ii v20.2s, v1.2s, v9.2s[1] - OP_ri v21.2s, v0.2s, v9.2s[1] - OP_ir v21.2s, v1.2s, v8.2s[1] + OP_rr v20.2s, v0.2s, v8.s[1] + OP_ii v20.2s, v1.2s, v9.s[1] + OP_ri v21.2s, v0.2s, v9.s[1] + OP_ir v21.2s, v1.2s, v8.s[1] .endm .macro SAVE2x2 @@ -910,15 +910,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.s, v1.s}[0], [pA] add pA, pA, #8 - OP_rr s16, s0, v8.2s[0] - OP_ii s16, s1, v9.2s[0] - OP_ri s17, s0, v9.2s[0] - OP_ir s17, s1, v8.2s[0] + OP_rr s16, s0, v8.s[0] + OP_ii s16, s1, v9.s[0] + OP_ri s17, s0, v9.s[0] + OP_ir s17, s1, v8.s[0] - OP_rr s20, s0, v8.2s[1] - OP_ii s20, s1, v9.2s[1] - OP_ri s21, s0, v9.2s[1] - OP_ir s21, s1, v8.2s[1] + OP_rr s20, s0, v8.s[1] + OP_ii s20, s1, v9.s[1] + OP_ri s21, s0, v9.s[1] + OP_ir s21, s1, v8.s[1] .endm .macro SAVE1x2 diff --git a/kernel/arm64/cgemm_kernel_8x4.S b/kernel/arm64/cgemm_kernel_8x4.S old mode 100755 new mode 100644 index 40b98cee2..d58cef52d --- a/kernel/arm64/cgemm_kernel_8x4.S +++ b/kernel/arm64/cgemm_kernel_8x4.S @@ -178,93 +178,93 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld2 {v2.4s, v3.4s}, [pA] add pA, pA, #32 - fmul v16.4s, v0.4s, v8.4s[0] - OP_ii v16.4s, v1.4s, v9.4s[0] + fmul v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v9.s[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v17.16b, v17.16b, v17.16b - fmls v17.4s, v0.4s, v9.4s[0] + fmls v17.4s, v0.4s, v9.s[0] #else - fmul v17.4s, v0.4s, v9.4s[0] + fmul v17.4s, v0.4s, v9.s[0] #endif - OP_ir v17.4s, v1.4s, v8.4s[0] + OP_ir v17.4s, v1.4s, v8.s[0] - fmul v18.4s, v2.4s, v8.4s[0] - OP_ii v18.4s, v3.4s, v9.4s[0] + fmul v18.4s, v2.4s, v8.s[0] + OP_ii v18.4s, v3.4s, v9.s[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v19.16b, v19.16b, v19.16b - fmls v19.4s, v2.4s, v9.4s[0] + fmls v19.4s, v2.4s, v9.s[0] #else - fmul v19.4s, v2.4s, v9.4s[0] + fmul v19.4s, v2.4s, v9.s[0] #endif - OP_ir v19.4s, v3.4s, v8.4s[0] + OP_ir v19.4s, v3.4s, v8.s[0] - fmul v20.4s, v0.4s, v8.4s[1] - OP_ii v20.4s, v1.4s, v9.4s[1] + fmul v20.4s, v0.4s, v8.s[1] + OP_ii v20.4s, v1.4s, v9.s[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v21.16b, v21.16b, v21.16b - fmls v21.4s, v0.4s, v9.4s[1] + fmls v21.4s, v0.4s, v9.s[1] #else - fmul v21.4s, v0.4s, v9.4s[1] + fmul v21.4s, v0.4s, v9.s[1] #endif - OP_ir v21.4s, v1.4s, v8.4s[1] + OP_ir v21.4s, v1.4s, v8.s[1] - fmul v22.4s, v2.4s, v8.4s[1] - OP_ii v22.4s, v3.4s, v9.4s[1] + fmul v22.4s, v2.4s, v8.s[1] + OP_ii v22.4s, v3.4s, v9.s[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v23.16b, v23.16b, v23.16b - fmls v23.4s, v2.4s, v9.4s[1] + fmls v23.4s, v2.4s, v9.s[1] #else - fmul v23.4s, v2.4s, v9.4s[1] + fmul v23.4s, v2.4s, v9.s[1] #endif - OP_ir v23.4s, v3.4s, v8.4s[1] + OP_ir v23.4s, v3.4s, v8.s[1] - fmul v24.4s, v0.4s, v8.4s[2] - OP_ii v24.4s, v1.4s, 
v9.4s[2] + fmul v24.4s, v0.4s, v8.s[2] + OP_ii v24.4s, v1.4s, v9.s[2] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v25.16b, v25.16b, v25.16b - fmls v25.4s, v0.4s, v9.4s[2] + fmls v25.4s, v0.4s, v9.s[2] #else - fmul v25.4s, v0.4s, v9.4s[2] + fmul v25.4s, v0.4s, v9.s[2] #endif - OP_ir v25.4s, v1.4s, v8.4s[2] + OP_ir v25.4s, v1.4s, v8.s[2] - fmul v26.4s, v2.4s, v8.4s[2] - OP_ii v26.4s, v3.4s, v9.4s[2] + fmul v26.4s, v2.4s, v8.s[2] + OP_ii v26.4s, v3.4s, v9.s[2] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v27.16b, v27.16b, v27.16b - fmls v27.4s, v2.4s, v9.4s[2] + fmls v27.4s, v2.4s, v9.s[2] #else - fmul v27.4s, v2.4s, v9.4s[2] + fmul v27.4s, v2.4s, v9.s[2] #endif - OP_ir v27.4s, v3.4s, v8.4s[2] + OP_ir v27.4s, v3.4s, v8.s[2] - fmul v28.4s, v0.4s, v8.4s[3] - OP_ii v28.4s, v1.4s, v9.4s[3] + fmul v28.4s, v0.4s, v8.s[3] + OP_ii v28.4s, v1.4s, v9.s[3] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v29.16b, v29.16b, v29.16b - fmls v29.4s, v0.4s, v9.4s[3] + fmls v29.4s, v0.4s, v9.s[3] #else - fmul v29.4s, v0.4s, v9.4s[3] + fmul v29.4s, v0.4s, v9.s[3] #endif - OP_ir v29.4s, v1.4s, v8.4s[3] + OP_ir v29.4s, v1.4s, v8.s[3] - fmul v30.4s, v2.4s, v8.4s[3] - OP_ii v30.4s, v3.4s, v9.4s[3] + fmul v30.4s, v2.4s, v8.s[3] + OP_ii v30.4s, v3.4s, v9.s[3] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v31.16b, v31.16b, v31.16b - fmls v31.4s, v2.4s, v9.4s[3] + fmls v31.4s, v2.4s, v9.s[3] #else - fmul v31.4s, v2.4s, v9.4s[3] + fmul v31.4s, v2.4s, v9.s[3] #endif - OP_ir v31.4s, v3.4s, v8.4s[3] + OP_ir v31.4s, v3.4s, v8.s[3] ld2 {v12.4s, v13.4s}, [pB] add pB, pB, #32 @@ -275,45 +275,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL8x4_M1 - OP_rr v16.4s, v0.4s, v8.4s[0] - OP_ii v16.4s, v1.4s, v9.4s[0] - OP_ri v17.4s, v0.4s, v9.4s[0] - OP_ir v17.4s, v1.4s, v8.4s[0] - - OP_rr v18.4s, v2.4s, v8.4s[0] - OP_ii v18.4s, v3.4s, v9.4s[0] - OP_ri v19.4s, v2.4s, v9.4s[0] - OP_ir v19.4s, v3.4s, v8.4s[0] - - OP_rr v20.4s, v0.4s, v8.4s[1] - OP_ii v20.4s, v1.4s, v9.4s[1] - OP_ri v21.4s, v0.4s, v9.4s[1] - OP_ir v21.4s, v1.4s, v8.4s[1] - - OP_rr v22.4s, v2.4s, v8.4s[1] - OP_ii v22.4s, v3.4s, v9.4s[1] - OP_ri v23.4s, v2.4s, v9.4s[1] - OP_ir v23.4s, v3.4s, v8.4s[1] - - OP_rr v24.4s, v0.4s, v8.4s[2] - OP_ii v24.4s, v1.4s, v9.4s[2] - OP_ri v25.4s, v0.4s, v9.4s[2] - OP_ir v25.4s, v1.4s, v8.4s[2] - - OP_rr v26.4s, v2.4s, v8.4s[2] - OP_ii v26.4s, v3.4s, v9.4s[2] - OP_ri v27.4s, v2.4s, v9.4s[2] - OP_ir v27.4s, v3.4s, v8.4s[2] - - OP_rr v28.4s, v0.4s, v8.4s[3] - OP_ii v28.4s, v1.4s, v9.4s[3] - OP_ri v29.4s, v0.4s, v9.4s[3] - OP_ir v29.4s, v1.4s, v8.4s[3] - - OP_rr v30.4s, v2.4s, v8.4s[3] - OP_ii v30.4s, v3.4s, v9.4s[3] - OP_ri v31.4s, v2.4s, v9.4s[3] - OP_ir v31.4s, v3.4s, v8.4s[3] + OP_rr v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v9.s[0] + OP_ri v17.4s, v0.4s, v9.s[0] + OP_ir v17.4s, v1.4s, v8.s[0] + + OP_rr v18.4s, v2.4s, v8.s[0] + OP_ii v18.4s, v3.4s, v9.s[0] + OP_ri v19.4s, v2.4s, v9.s[0] + OP_ir v19.4s, v3.4s, v8.s[0] + + OP_rr v20.4s, v0.4s, v8.s[1] + OP_ii v20.4s, v1.4s, v9.s[1] + OP_ri v21.4s, v0.4s, v9.s[1] + OP_ir v21.4s, v1.4s, v8.s[1] + + OP_rr v22.4s, v2.4s, v8.s[1] + OP_ii v22.4s, v3.4s, v9.s[1] + OP_ri v23.4s, v2.4s, v9.s[1] + OP_ir v23.4s, v3.4s, v8.s[1] + + OP_rr v24.4s, v0.4s, v8.s[2] + OP_ii v24.4s, v1.4s, v9.s[2] + OP_ri v25.4s, v0.4s, v9.s[2] + OP_ir v25.4s, v1.4s, v8.s[2] + + OP_rr v26.4s, v2.4s, v8.s[2] + OP_ii v26.4s, v3.4s, v9.s[2] + OP_ri v27.4s, v2.4s, v9.s[2] + OP_ir v27.4s, v3.4s, v8.s[2] + + OP_rr v28.4s, v0.4s, v8.s[3] + OP_ii v28.4s, v1.4s, v9.s[3] + OP_ri v29.4s, v0.4s, v9.s[3] + OP_ir v29.4s, v1.4s, v8.s[3] + + OP_rr v30.4s, v2.4s, v8.s[3] + OP_ii v30.4s, v3.4s, 
v9.s[3] + OP_ri v31.4s, v2.4s, v9.s[3] + OP_ir v31.4s, v3.4s, v8.s[3] ld2 {v12.4s, v13.4s}, [pB] // For next round add pB, pB, #32 @@ -324,45 +324,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL8x4_M2 - OP_rr v16.4s, v4.4s, v12.4s[0] - OP_ii v16.4s, v5.4s, v13.4s[0] - OP_ri v17.4s, v4.4s, v13.4s[0] - OP_ir v17.4s, v5.4s, v12.4s[0] - - OP_rr v18.4s, v6.4s, v12.4s[0] - OP_ii v18.4s, v7.4s, v13.4s[0] - OP_ri v19.4s, v6.4s, v13.4s[0] - OP_ir v19.4s, v7.4s, v12.4s[0] - - OP_rr v20.4s, v4.4s, v12.4s[1] - OP_ii v20.4s, v5.4s, v13.4s[1] - OP_ri v21.4s, v4.4s, v13.4s[1] - OP_ir v21.4s, v5.4s, v12.4s[1] - - OP_rr v22.4s, v6.4s, v12.4s[1] - OP_ii v22.4s, v7.4s, v13.4s[1] - OP_ri v23.4s, v6.4s, v13.4s[1] - OP_ir v23.4s, v7.4s, v12.4s[1] - - OP_rr v24.4s, v4.4s, v12.4s[2] - OP_ii v24.4s, v5.4s, v13.4s[2] - OP_ri v25.4s, v4.4s, v13.4s[2] - OP_ir v25.4s, v5.4s, v12.4s[2] - - OP_rr v26.4s, v6.4s, v12.4s[2] - OP_ii v26.4s, v7.4s, v13.4s[2] - OP_ri v27.4s, v6.4s, v13.4s[2] - OP_ir v27.4s, v7.4s, v12.4s[2] - - OP_rr v28.4s, v4.4s, v12.4s[3] - OP_ii v28.4s, v5.4s, v13.4s[3] - OP_ri v29.4s, v4.4s, v13.4s[3] - OP_ir v29.4s, v5.4s, v12.4s[3] - - OP_rr v30.4s, v6.4s, v12.4s[3] - OP_ii v30.4s, v7.4s, v13.4s[3] - OP_ri v31.4s, v6.4s, v13.4s[3] - OP_ir v31.4s, v7.4s, v12.4s[3] + OP_rr v16.4s, v4.4s, v12.s[0] + OP_ii v16.4s, v5.4s, v13.s[0] + OP_ri v17.4s, v4.4s, v13.s[0] + OP_ir v17.4s, v5.4s, v12.s[0] + + OP_rr v18.4s, v6.4s, v12.s[0] + OP_ii v18.4s, v7.4s, v13.s[0] + OP_ri v19.4s, v6.4s, v13.s[0] + OP_ir v19.4s, v7.4s, v12.s[0] + + OP_rr v20.4s, v4.4s, v12.s[1] + OP_ii v20.4s, v5.4s, v13.s[1] + OP_ri v21.4s, v4.4s, v13.s[1] + OP_ir v21.4s, v5.4s, v12.s[1] + + OP_rr v22.4s, v6.4s, v12.s[1] + OP_ii v22.4s, v7.4s, v13.s[1] + OP_ri v23.4s, v6.4s, v13.s[1] + OP_ir v23.4s, v7.4s, v12.s[1] + + OP_rr v24.4s, v4.4s, v12.s[2] + OP_ii v24.4s, v5.4s, v13.s[2] + OP_ri v25.4s, v4.4s, v13.s[2] + OP_ir v25.4s, v5.4s, v12.s[2] + + OP_rr v26.4s, v6.4s, 
v12.s[2] + OP_ii v26.4s, v7.4s, v13.s[2] + OP_ri v27.4s, v6.4s, v13.s[2] + OP_ir v27.4s, v7.4s, v12.s[2] + + OP_rr v28.4s, v4.4s, v12.s[3] + OP_ii v28.4s, v5.4s, v13.s[3] + OP_ri v29.4s, v4.4s, v13.s[3] + OP_ir v29.4s, v5.4s, v12.s[3] + + OP_rr v30.4s, v6.4s, v12.s[3] + OP_ii v30.4s, v7.4s, v13.s[3] + OP_ri v31.4s, v6.4s, v13.s[3] + OP_ir v31.4s, v7.4s, v12.s[3] ld2 {v8.4s, v9.4s}, [pB] add pB, pB, #32 @@ -373,45 +373,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL8x4_E - OP_rr v16.4s, v4.4s, v12.4s[0] - OP_ii v16.4s, v5.4s, v13.4s[0] - OP_ri v17.4s, v4.4s, v13.4s[0] - OP_ir v17.4s, v5.4s, v12.4s[0] - - OP_rr v18.4s, v6.4s, v12.4s[0] - OP_ii v18.4s, v7.4s, v13.4s[0] - OP_ri v19.4s, v6.4s, v13.4s[0] - OP_ir v19.4s, v7.4s, v12.4s[0] - - OP_rr v20.4s, v4.4s, v12.4s[1] - OP_ii v20.4s, v5.4s, v13.4s[1] - OP_ri v21.4s, v4.4s, v13.4s[1] - OP_ir v21.4s, v5.4s, v12.4s[1] - - OP_rr v22.4s, v6.4s, v12.4s[1] - OP_ii v22.4s, v7.4s, v13.4s[1] - OP_ri v23.4s, v6.4s, v13.4s[1] - OP_ir v23.4s, v7.4s, v12.4s[1] - - OP_rr v24.4s, v4.4s, v12.4s[2] - OP_ii v24.4s, v5.4s, v13.4s[2] - OP_ri v25.4s, v4.4s, v13.4s[2] - OP_ir v25.4s, v5.4s, v12.4s[2] - - OP_rr v26.4s, v6.4s, v12.4s[2] - OP_ii v26.4s, v7.4s, v13.4s[2] - OP_ri v27.4s, v6.4s, v13.4s[2] - OP_ir v27.4s, v7.4s, v12.4s[2] - - OP_rr v28.4s, v4.4s, v12.4s[3] - OP_ii v28.4s, v5.4s, v13.4s[3] - OP_ri v29.4s, v4.4s, v13.4s[3] - OP_ir v29.4s, v5.4s, v12.4s[3] - - OP_rr v30.4s, v6.4s, v12.4s[3] - OP_ii v30.4s, v7.4s, v13.4s[3] - OP_ri v31.4s, v6.4s, v13.4s[3] - OP_ir v31.4s, v7.4s, v12.4s[3] + OP_rr v16.4s, v4.4s, v12.s[0] + OP_ii v16.4s, v5.4s, v13.s[0] + OP_ri v17.4s, v4.4s, v13.s[0] + OP_ir v17.4s, v5.4s, v12.s[0] + + OP_rr v18.4s, v6.4s, v12.s[0] + OP_ii v18.4s, v7.4s, v13.s[0] + OP_ri v19.4s, v6.4s, v13.s[0] + OP_ir v19.4s, v7.4s, v12.s[0] + + OP_rr v20.4s, v4.4s, v12.s[1] + OP_ii v20.4s, v5.4s, v13.s[1] + OP_ri v21.4s, v4.4s, v13.s[1] + OP_ir v21.4s, v5.4s, v12.s[1] + + OP_rr 
v22.4s, v6.4s, v12.s[1] + OP_ii v22.4s, v7.4s, v13.s[1] + OP_ri v23.4s, v6.4s, v13.s[1] + OP_ir v23.4s, v7.4s, v12.s[1] + + OP_rr v24.4s, v4.4s, v12.s[2] + OP_ii v24.4s, v5.4s, v13.s[2] + OP_ri v25.4s, v4.4s, v13.s[2] + OP_ir v25.4s, v5.4s, v12.s[2] + + OP_rr v26.4s, v6.4s, v12.s[2] + OP_ii v26.4s, v7.4s, v13.s[2] + OP_ri v27.4s, v6.4s, v13.s[2] + OP_ir v27.4s, v7.4s, v12.s[2] + + OP_rr v28.4s, v4.4s, v12.s[3] + OP_ii v28.4s, v5.4s, v13.s[3] + OP_ri v29.4s, v4.4s, v13.s[3] + OP_ir v29.4s, v5.4s, v12.s[3] + + OP_rr v30.4s, v6.4s, v12.s[3] + OP_ii v30.4s, v7.4s, v13.s[3] + OP_ri v31.4s, v6.4s, v13.s[3] + OP_ir v31.4s, v7.4s, v12.s[3] .endm @@ -423,45 +423,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v2.4s, v3.4s}, [pA] add pA, pA, #32 - OP_rr v16.4s, v0.4s, v8.4s[0] - OP_ii v16.4s, v1.4s, v9.4s[0] - OP_ri v17.4s, v0.4s, v9.4s[0] - OP_ir v17.4s, v1.4s, v8.4s[0] - - OP_rr v18.4s, v2.4s, v8.4s[0] - OP_ii v18.4s, v3.4s, v9.4s[0] - OP_ri v19.4s, v2.4s, v9.4s[0] - OP_ir v19.4s, v3.4s, v8.4s[0] - - OP_rr v20.4s, v0.4s, v8.4s[1] - OP_ii v20.4s, v1.4s, v9.4s[1] - OP_ri v21.4s, v0.4s, v9.4s[1] - OP_ir v21.4s, v1.4s, v8.4s[1] - - OP_rr v22.4s, v2.4s, v8.4s[1] - OP_ii v22.4s, v3.4s, v9.4s[1] - OP_ri v23.4s, v2.4s, v9.4s[1] - OP_ir v23.4s, v3.4s, v8.4s[1] - - OP_rr v24.4s, v0.4s, v8.4s[2] - OP_ii v24.4s, v1.4s, v9.4s[2] - OP_ri v25.4s, v0.4s, v9.4s[2] - OP_ir v25.4s, v1.4s, v8.4s[2] - - OP_rr v26.4s, v2.4s, v8.4s[2] - OP_ii v26.4s, v3.4s, v9.4s[2] - OP_ri v27.4s, v2.4s, v9.4s[2] - OP_ir v27.4s, v3.4s, v8.4s[2] - - OP_rr v28.4s, v0.4s, v8.4s[3] - OP_ii v28.4s, v1.4s, v9.4s[3] - OP_ri v29.4s, v0.4s, v9.4s[3] - OP_ir v29.4s, v1.4s, v8.4s[3] - - OP_rr v30.4s, v2.4s, v8.4s[3] - OP_ii v30.4s, v3.4s, v9.4s[3] - OP_ri v31.4s, v2.4s, v9.4s[3] - OP_ir v31.4s, v3.4s, v8.4s[3] + OP_rr v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v9.s[0] + OP_ri v17.4s, v0.4s, v9.s[0] + OP_ir v17.4s, v1.4s, v8.s[0] + + OP_rr v18.4s, v2.4s, v8.s[0] + OP_ii v18.4s, v3.4s, 
v9.s[0] + OP_ri v19.4s, v2.4s, v9.s[0] + OP_ir v19.4s, v3.4s, v8.s[0] + + OP_rr v20.4s, v0.4s, v8.s[1] + OP_ii v20.4s, v1.4s, v9.s[1] + OP_ri v21.4s, v0.4s, v9.s[1] + OP_ir v21.4s, v1.4s, v8.s[1] + + OP_rr v22.4s, v2.4s, v8.s[1] + OP_ii v22.4s, v3.4s, v9.s[1] + OP_ri v23.4s, v2.4s, v9.s[1] + OP_ir v23.4s, v3.4s, v8.s[1] + + OP_rr v24.4s, v0.4s, v8.s[2] + OP_ii v24.4s, v1.4s, v9.s[2] + OP_ri v25.4s, v0.4s, v9.s[2] + OP_ir v25.4s, v1.4s, v8.s[2] + + OP_rr v26.4s, v2.4s, v8.s[2] + OP_ii v26.4s, v3.4s, v9.s[2] + OP_ri v27.4s, v2.4s, v9.s[2] + OP_ir v27.4s, v3.4s, v8.s[2] + + OP_rr v28.4s, v0.4s, v8.s[3] + OP_ii v28.4s, v1.4s, v9.s[3] + OP_ri v29.4s, v0.4s, v9.s[3] + OP_ir v29.4s, v1.4s, v8.s[3] + + OP_rr v30.4s, v2.4s, v8.s[3] + OP_ii v30.4s, v3.4s, v9.s[3] + OP_ri v31.4s, v2.4s, v9.s[3] + OP_ir v31.4s, v3.4s, v8.s[3] .endm @@ -560,49 +560,49 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.4s, v1.4s}, [pA] add pA, pA, #32 - fmul v16.4s, v0.4s, v8.4s[0] - OP_ii v16.4s, v1.4s, v9.4s[0] + fmul v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v9.s[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v17.16b, v17.16b, v17.16b - fmls v17.4s, v0.4s, v9.4s[0] + fmls v17.4s, v0.4s, v9.s[0] #else - fmul v17.4s, v0.4s, v9.4s[0] + fmul v17.4s, v0.4s, v9.s[0] #endif - OP_ir v17.4s, v1.4s, v8.4s[0] + OP_ir v17.4s, v1.4s, v8.s[0] - fmul v20.4s, v0.4s, v8.4s[1] - OP_ii v20.4s, v1.4s, v9.4s[1] + fmul v20.4s, v0.4s, v8.s[1] + OP_ii v20.4s, v1.4s, v9.s[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v21.16b, v21.16b, v21.16b - fmls v21.4s, v0.4s, v9.4s[1] + fmls v21.4s, v0.4s, v9.s[1] #else - fmul v21.4s, v0.4s, v9.4s[1] + fmul v21.4s, v0.4s, v9.s[1] #endif - OP_ir v21.4s, v1.4s, v8.4s[1] + OP_ir v21.4s, v1.4s, v8.s[1] - fmul v24.4s, v0.4s, v8.4s[2] - OP_ii v24.4s, v1.4s, v9.4s[2] + fmul v24.4s, 
v0.4s, v8.s[2] + OP_ii v24.4s, v1.4s, v9.s[2] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v25.16b, v25.16b, v25.16b - fmls v25.4s, v0.4s, v9.4s[2] + fmls v25.4s, v0.4s, v9.s[2] #else - fmul v25.4s, v0.4s, v9.4s[2] + fmul v25.4s, v0.4s, v9.s[2] #endif - OP_ir v25.4s, v1.4s, v8.4s[2] + OP_ir v25.4s, v1.4s, v8.s[2] - fmul v28.4s, v0.4s, v8.4s[3] - OP_ii v28.4s, v1.4s, v9.4s[3] + fmul v28.4s, v0.4s, v8.s[3] + OP_ii v28.4s, v1.4s, v9.s[3] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v29.16b, v29.16b, v29.16b - fmls v29.4s, v0.4s, v9.4s[3] + fmls v29.4s, v0.4s, v9.s[3] #else - fmul v29.4s, v0.4s, v9.4s[3] + fmul v29.4s, v0.4s, v9.s[3] #endif - OP_ir v29.4s, v1.4s, v8.4s[3] + OP_ir v29.4s, v1.4s, v8.s[3] ld2 {v12.4s, v13.4s}, [pB] add pB, pB, #32 @@ -611,85 +611,85 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL4x4_M1 - OP_rr v16.4s, v0.4s, v8.4s[0] - OP_ii v16.4s, v1.4s, v9.4s[0] - OP_ri v17.4s, v0.4s, v9.4s[0] - OP_ir v17.4s, v1.4s, v8.4s[0] + OP_rr v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v9.s[0] + OP_ri v17.4s, v0.4s, v9.s[0] + OP_ir v17.4s, v1.4s, v8.s[0] ld2 {v12.4s, v13.4s}, [pB] // For next round add pB, pB, #32 - OP_rr v20.4s, v0.4s, v8.4s[1] - OP_ii v20.4s, v1.4s, v9.4s[1] - OP_ri v21.4s, v0.4s, v9.4s[1] - OP_ir v21.4s, v1.4s, v8.4s[1] + OP_rr v20.4s, v0.4s, v8.s[1] + OP_ii v20.4s, v1.4s, v9.s[1] + OP_ri v21.4s, v0.4s, v9.s[1] + OP_ir v21.4s, v1.4s, v8.s[1] ld2 {v4.4s, v5.4s}, [pA] // For next round add pA, pA, #32 - OP_rr v24.4s, v0.4s, v8.4s[2] - OP_ii v24.4s, v1.4s, v9.4s[2] - OP_ri v25.4s, v0.4s, v9.4s[2] - OP_ir v25.4s, v1.4s, v8.4s[2] + OP_rr v24.4s, v0.4s, v8.s[2] + OP_ii v24.4s, v1.4s, v9.s[2] + OP_ri v25.4s, v0.4s, v9.s[2] + OP_ir v25.4s, v1.4s, v8.s[2] prfm PLDL1KEEP, [pA, #512] - OP_rr v28.4s, v0.4s, v8.4s[3] - OP_ii v28.4s, v1.4s, v9.4s[3] - OP_ri v29.4s, v0.4s, v9.4s[3] - OP_ir v29.4s, v1.4s, v8.4s[3] + OP_rr v28.4s, v0.4s, v8.s[3] + OP_ii v28.4s, v1.4s, v9.s[3] + OP_ri v29.4s, v0.4s, v9.s[3] + OP_ir v29.4s, v1.4s, v8.s[3] .endm .macro KERNEL4x4_M2 - OP_rr v16.4s, v4.4s, v12.4s[0] - OP_ii v16.4s, v5.4s, v13.4s[0] - OP_ri v17.4s, v4.4s, v13.4s[0] - OP_ir v17.4s, v5.4s, v12.4s[0] + OP_rr v16.4s, v4.4s, v12.s[0] + OP_ii v16.4s, v5.4s, v13.s[0] + OP_ri v17.4s, v4.4s, v13.s[0] + OP_ir v17.4s, v5.4s, v12.s[0] ld2 {v8.4s, v9.4s}, [pB] // For next round add pB, pB, #32 - OP_rr v20.4s, v4.4s, v12.4s[1] - OP_ii v20.4s, v5.4s, v13.4s[1] - OP_ri v21.4s, v4.4s, v13.4s[1] - OP_ir v21.4s, v5.4s, v12.4s[1] + OP_rr v20.4s, v4.4s, v12.s[1] + OP_ii v20.4s, v5.4s, v13.s[1] + OP_ri v21.4s, v4.4s, v13.s[1] + OP_ir v21.4s, v5.4s, v12.s[1] ld2 {v0.4s, v1.4s}, [pA] // For next round add pA, pA, #32 - OP_rr v24.4s, v4.4s, v12.4s[2] - OP_ii v24.4s, v5.4s, v13.4s[2] - OP_ri v25.4s, v4.4s, v13.4s[2] - OP_ir v25.4s, v5.4s, v12.4s[2] + OP_rr v24.4s, v4.4s, 
v12.s[2] + OP_ii v24.4s, v5.4s, v13.s[2] + OP_ri v25.4s, v4.4s, v13.s[2] + OP_ir v25.4s, v5.4s, v12.s[2] prfm PLDL1KEEP, [pB, #512] - OP_rr v28.4s, v4.4s, v12.4s[3] - OP_ii v28.4s, v5.4s, v13.4s[3] - OP_ri v29.4s, v4.4s, v13.4s[3] - OP_ir v29.4s, v5.4s, v12.4s[3] + OP_rr v28.4s, v4.4s, v12.s[3] + OP_ii v28.4s, v5.4s, v13.s[3] + OP_ri v29.4s, v4.4s, v13.s[3] + OP_ir v29.4s, v5.4s, v12.s[3] .endm .macro KERNEL4x4_E - OP_rr v16.4s, v4.4s, v12.4s[0] - OP_ii v16.4s, v5.4s, v13.4s[0] - OP_ri v17.4s, v4.4s, v13.4s[0] - OP_ir v17.4s, v5.4s, v12.4s[0] - - OP_rr v20.4s, v4.4s, v12.4s[1] - OP_ii v20.4s, v5.4s, v13.4s[1] - OP_ri v21.4s, v4.4s, v13.4s[1] - OP_ir v21.4s, v5.4s, v12.4s[1] - - OP_rr v24.4s, v4.4s, v12.4s[2] - OP_ii v24.4s, v5.4s, v13.4s[2] - OP_ri v25.4s, v4.4s, v13.4s[2] - OP_ir v25.4s, v5.4s, v12.4s[2] - - OP_rr v28.4s, v4.4s, v12.4s[3] - OP_ii v28.4s, v5.4s, v13.4s[3] - OP_ri v29.4s, v4.4s, v13.4s[3] - OP_ir v29.4s, v5.4s, v12.4s[3] + OP_rr v16.4s, v4.4s, v12.s[0] + OP_ii v16.4s, v5.4s, v13.s[0] + OP_ri v17.4s, v4.4s, v13.s[0] + OP_ir v17.4s, v5.4s, v12.s[0] + + OP_rr v20.4s, v4.4s, v12.s[1] + OP_ii v20.4s, v5.4s, v13.s[1] + OP_ri v21.4s, v4.4s, v13.s[1] + OP_ir v21.4s, v5.4s, v12.s[1] + + OP_rr v24.4s, v4.4s, v12.s[2] + OP_ii v24.4s, v5.4s, v13.s[2] + OP_ri v25.4s, v4.4s, v13.s[2] + OP_ir v25.4s, v5.4s, v12.s[2] + + OP_rr v28.4s, v4.4s, v12.s[3] + OP_ii v28.4s, v5.4s, v13.s[3] + OP_ri v29.4s, v4.4s, v13.s[3] + OP_ir v29.4s, v5.4s, v12.s[3] .endm .macro KERNEL4x4_SUB @@ -698,25 +698,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld2 {v0.4s, v1.4s}, [pA] add pA, pA, #32 - OP_rr v16.4s, v0.4s, v8.4s[0] - OP_ii v16.4s, v1.4s, v9.4s[0] - OP_ri v17.4s, v0.4s, v9.4s[0] - OP_ir v17.4s, v1.4s, v8.4s[0] - - OP_rr v20.4s, v0.4s, v8.4s[1] - OP_ii v20.4s, v1.4s, v9.4s[1] - OP_ri v21.4s, v0.4s, v9.4s[1] - OP_ir v21.4s, v1.4s, v8.4s[1] - - OP_rr v24.4s, v0.4s, v8.4s[2] - OP_ii v24.4s, v1.4s, v9.4s[2] - OP_ri v25.4s, v0.4s, v9.4s[2] - OP_ir v25.4s, v1.4s, v8.4s[2] - - OP_rr v28.4s, v0.4s, v8.4s[3] - OP_ii v28.4s, v1.4s, v9.4s[3] - OP_ri v29.4s, v0.4s, v9.4s[3] - OP_ir v29.4s, v1.4s, v8.4s[3] + OP_rr v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v9.s[0] + OP_ri v17.4s, v0.4s, v9.s[0] + OP_ir v17.4s, v1.4s, v8.s[0] + + OP_rr v20.4s, v0.4s, v8.s[1] + OP_ii v20.4s, v1.4s, v9.s[1] + OP_ri v21.4s, v0.4s, v9.s[1] + OP_ir v21.4s, v1.4s, v8.s[1] + + OP_rr v24.4s, v0.4s, v8.s[2] + OP_ii v24.4s, v1.4s, v9.s[2] + OP_ri v25.4s, v0.4s, v9.s[2] + OP_ir v25.4s, v1.4s, v8.s[2] + + OP_rr v28.4s, v0.4s, v8.s[3] + OP_ii v28.4s, v1.4s, v9.s[3] + OP_ri v29.4s, v0.4s, v9.s[3] + OP_ir v29.4s, v1.4s, v8.s[3] .endm .macro SAVE4x4 @@ -778,25 +778,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld2 {v0.2s, v1.2s}, [pA] add pA, pA, #16 - OP_rr v16.2s, v0.2s, v8.4s[0] - OP_ii v16.2s, v1.2s, v9.4s[0] - OP_ri v17.2s, v0.2s, v9.4s[0] - OP_ir v17.2s, v1.2s, v8.4s[0] - - OP_rr v20.2s, v0.2s, v8.4s[1] - OP_ii v20.2s, v1.2s, v9.4s[1] - OP_ri v21.2s, v0.2s, v9.4s[1] - OP_ir v21.2s, v1.2s, v8.4s[1] - - OP_rr v24.2s, v0.2s, v8.4s[2] - OP_ii v24.2s, v1.2s, v9.4s[2] - OP_ri v25.2s, v0.2s, v9.4s[2] - OP_ir v25.2s, v1.2s, v8.4s[2] - - OP_rr v28.2s, v0.2s, v8.4s[3] - OP_ii v28.2s, v1.2s, v9.4s[3] - OP_ri v29.2s, v0.2s, v9.4s[3] - OP_ir v29.2s, v1.2s, v8.4s[3] + OP_rr v16.2s, v0.2s, v8.s[0] + OP_ii v16.2s, v1.2s, v9.s[0] + OP_ri v17.2s, v0.2s, v9.s[0] + OP_ir v17.2s, v1.2s, v8.s[0] + + OP_rr v20.2s, v0.2s, v8.s[1] + OP_ii v20.2s, v1.2s, v9.s[1] + OP_ri v21.2s, v0.2s, v9.s[1] + OP_ir v21.2s, v1.2s, v8.s[1] + + OP_rr v24.2s, v0.2s, v8.s[2] + OP_ii v24.2s, v1.2s, v9.s[2] + OP_ri v25.2s, v0.2s, v9.s[2] + OP_ir v25.2s, v1.2s, v8.s[2] + + OP_rr v28.2s, v0.2s, v8.s[3] + OP_ii v28.2s, v1.2s, v9.s[3] + OP_ri v29.2s, v0.2s, v9.s[3] + OP_ir v29.2s, v1.2s, v8.s[3] .endm .macro SAVE2x4 @@ -858,25 +858,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld2 {v0.s, v1.s}[0], [pA] add pA, pA, #8 - OP_rr s16, s0, v8.4s[0] - OP_ii s16, s1, v9.4s[0] - OP_ri s17, s0, v9.4s[0] - OP_ir s17, s1, v8.4s[0] - - OP_rr s20, s0, v8.4s[1] - OP_ii s20, s1, v9.4s[1] - OP_ri s21, s0, v9.4s[1] - OP_ir s21, s1, v8.4s[1] - - OP_rr s24, s0, v8.4s[2] - OP_ii s24, s1, v9.4s[2] - OP_ri s25, s0, v9.4s[2] - OP_ir s25, s1, v8.4s[2] - - OP_rr s28, s0, v8.4s[3] - OP_ii s28, s1, v9.4s[3] - OP_ri s29, s0, v9.4s[3] - OP_ir s29, s1, v8.4s[3] + OP_rr s16, s0, v8.s[0] + OP_ii s16, s1, v9.s[0] + OP_ri s17, s0, v9.s[0] + OP_ir s17, s1, v8.s[0] + + OP_rr s20, s0, v8.s[1] + OP_ii s20, s1, v9.s[1] + OP_ri s21, s0, v9.s[1] + OP_ir s21, s1, v8.s[1] + + OP_rr s24, s0, v8.s[2] + OP_ii s24, s1, v9.s[2] + OP_ri s25, s0, v9.s[2] + OP_ir s25, s1, v8.s[2] + + OP_rr s28, s0, v8.s[3] + OP_ii s28, s1, v9.s[3] + OP_ri s29, s0, v9.s[3] + OP_ir s29, s1, v8.s[3] .endm .macro SAVE1x4 @@ -940,25 +940,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v2.4s, v3.4s}, [pA] add pA, pA, #32 - OP_rr v16.4s, v0.4s, v8.2s[0] - OP_ii v16.4s, v1.4s, v9.2s[0] - OP_ri v17.4s, v0.4s, v9.2s[0] - OP_ir v17.4s, v1.4s, v8.2s[0] - - OP_rr v18.4s, v2.4s, v8.2s[0] - OP_ii v18.4s, v3.4s, v9.2s[0] - OP_ri v19.4s, v2.4s, v9.2s[0] - OP_ir v19.4s, v3.4s, v8.2s[0] - - OP_rr v20.4s, v0.4s, v8.2s[1] - OP_ii v20.4s, v1.4s, v9.2s[1] - OP_ri v21.4s, v0.4s, v9.2s[1] - OP_ir v21.4s, v1.4s, v8.2s[1] - - OP_rr v22.4s, v2.4s, v8.2s[1] - OP_ii v22.4s, v3.4s, v9.2s[1] - OP_ri v23.4s, v2.4s, v9.2s[1] - OP_ir v23.4s, v3.4s, v8.2s[1] + OP_rr v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v9.s[0] + OP_ri v17.4s, v0.4s, v9.s[0] + OP_ir v17.4s, v1.4s, v8.s[0] + + OP_rr v18.4s, v2.4s, v8.s[0] + OP_ii v18.4s, v3.4s, v9.s[0] + OP_ri v19.4s, v2.4s, v9.s[0] + OP_ir v19.4s, v3.4s, v8.s[0] + + OP_rr v20.4s, v0.4s, v8.s[1] + OP_ii v20.4s, v1.4s, v9.s[1] + OP_ri v21.4s, v0.4s, v9.s[1] + OP_ir v21.4s, v1.4s, v8.s[1] + + OP_rr v22.4s, v2.4s, v8.s[1] + OP_ii v22.4s, v3.4s, v9.s[1] + OP_ri v23.4s, 
v2.4s, v9.s[1] + OP_ir v23.4s, v3.4s, v8.s[1] .endm .macro SAVE8x2 @@ -1016,15 +1016,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.4s, v1.4s}, [pA] add pA, pA, #32 - OP_rr v16.4s, v0.4s, v8.2s[0] - OP_ii v16.4s, v1.4s, v9.2s[0] - OP_ri v17.4s, v0.4s, v9.2s[0] - OP_ir v17.4s, v1.4s, v8.2s[0] + OP_rr v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v9.s[0] + OP_ri v17.4s, v0.4s, v9.s[0] + OP_ir v17.4s, v1.4s, v8.s[0] - OP_rr v20.4s, v0.4s, v8.2s[1] - OP_ii v20.4s, v1.4s, v9.2s[1] - OP_ri v21.4s, v0.4s, v9.2s[1] - OP_ir v21.4s, v1.4s, v8.2s[1] + OP_rr v20.4s, v0.4s, v8.s[1] + OP_ii v20.4s, v1.4s, v9.s[1] + OP_ri v21.4s, v0.4s, v9.s[1] + OP_ir v21.4s, v1.4s, v8.s[1] .endm .macro SAVE4x2 @@ -1064,15 +1064,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.2s, v1.2s}, [pA] add pA, pA, #16 - OP_rr v16.2s, v0.2s, v8.2s[0] - OP_ii v16.2s, v1.2s, v9.2s[0] - OP_ri v17.2s, v0.2s, v9.2s[0] - OP_ir v17.2s, v1.2s, v8.2s[0] + OP_rr v16.2s, v0.2s, v8.s[0] + OP_ii v16.2s, v1.2s, v9.s[0] + OP_ri v17.2s, v0.2s, v9.s[0] + OP_ir v17.2s, v1.2s, v8.s[0] - OP_rr v20.2s, v0.2s, v8.2s[1] - OP_ii v20.2s, v1.2s, v9.2s[1] - OP_ri v21.2s, v0.2s, v9.2s[1] - OP_ir v21.2s, v1.2s, v8.2s[1] + OP_rr v20.2s, v0.2s, v8.s[1] + OP_ii v20.2s, v1.2s, v9.s[1] + OP_ri v21.2s, v0.2s, v9.s[1] + OP_ir v21.2s, v1.2s, v8.s[1] .endm .macro SAVE2x2 @@ -1112,15 +1112,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld2 {v0.s, v1.s}[0], [pA] add pA, pA, #8 - OP_rr s16, s0, v8.2s[0] - OP_ii s16, s1, v9.2s[0] - OP_ri s17, s0, v9.2s[0] - OP_ir s17, s1, v8.2s[0] + OP_rr s16, s0, v8.s[0] + OP_ii s16, s1, v9.s[0] + OP_ri s17, s0, v9.s[0] + OP_ir s17, s1, v8.s[0] - OP_rr s20, s0, v8.2s[1] - OP_ii s20, s1, v9.2s[1] - OP_ri s21, s0, v9.2s[1] - OP_ir s21, s1, v8.2s[1] + OP_rr s20, s0, v8.s[1] + OP_ii s20, s1, v9.s[1] + OP_ri s21, s0, v9.s[1] + OP_ir s21, s1, v8.s[1] .endm .macro SAVE1x2 @@ -1162,15 +1162,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v2.4s, v3.4s}, [pA] add pA, pA, #32 - OP_rr v16.4s, v0.4s, v8.4s[0] - OP_ii v16.4s, v1.4s, v8.4s[1] - OP_ri v17.4s, v0.4s, v8.4s[1] - OP_ir v17.4s, v1.4s, v8.4s[0] + OP_rr v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v8.s[1] + OP_ri v17.4s, v0.4s, v8.s[1] + OP_ir v17.4s, v1.4s, v8.s[0] - OP_rr v18.4s, v2.4s, v8.4s[0] - OP_ii v18.4s, v3.4s, v8.4s[1] - OP_ri v19.4s, v2.4s, v8.4s[1] - OP_ir v19.4s, v3.4s, v8.4s[0] + OP_rr v18.4s, v2.4s, v8.s[0] + OP_ii v18.4s, v3.4s, v8.s[1] + OP_ri v19.4s, v2.4s, v8.s[1] + OP_ir v19.4s, v3.4s, v8.s[0] .endm .macro SAVE8x1 diff --git a/kernel/arm64/ctrmm_kernel_4x4.S b/kernel/arm64/ctrmm_kernel_4x4.S index be0e9bdef..3de27257a 100644 --- a/kernel/arm64/ctrmm_kernel_4x4.S +++ b/kernel/arm64/ctrmm_kernel_4x4.S @@ -170,49 +170,49 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld2 {v0.4s, v1.4s}, [pA] add pA, pA, #32 - fmul v16.4s, v0.4s, v8.4s[0] - OP_ii v16.4s, v1.4s, v9.4s[0] + fmul v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v9.s[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v17.16b, v17.16b, v17.16b - fmls v17.4s, v0.4s, v9.4s[0] + fmls v17.4s, v0.4s, v9.s[0] #else - fmul v17.4s, v0.4s, v9.4s[0] + fmul v17.4s, v0.4s, v9.s[0] #endif - OP_ir v17.4s, v1.4s, v8.4s[0] + OP_ir v17.4s, v1.4s, v8.s[0] - fmul v20.4s, v0.4s, v8.4s[1] - OP_ii v20.4s, v1.4s, v9.4s[1] + fmul v20.4s, v0.4s, v8.s[1] + OP_ii v20.4s, v1.4s, v9.s[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v21.16b, v21.16b, v21.16b - fmls v21.4s, v0.4s, v9.4s[1] + fmls v21.4s, v0.4s, v9.s[1] #else - fmul v21.4s, v0.4s, v9.4s[1] + fmul v21.4s, v0.4s, v9.s[1] #endif - OP_ir v21.4s, v1.4s, v8.4s[1] + OP_ir v21.4s, v1.4s, v8.s[1] - fmul v24.4s, v0.4s, v8.4s[2] - OP_ii v24.4s, v1.4s, v9.4s[2] + fmul v24.4s, v0.4s, v8.s[2] + OP_ii v24.4s, v1.4s, v9.s[2] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v25.16b, v25.16b, v25.16b - fmls v25.4s, v0.4s, v9.4s[2] + fmls v25.4s, v0.4s, v9.s[2] #else - fmul v25.4s, v0.4s, v9.4s[2] + fmul v25.4s, v0.4s, v9.s[2] #endif - OP_ir v25.4s, v1.4s, v8.4s[2] + OP_ir v25.4s, v1.4s, v8.s[2] - fmul v28.4s, v0.4s, v8.4s[3] - OP_ii v28.4s, v1.4s, v9.4s[3] + fmul v28.4s, v0.4s, v8.s[3] + OP_ii v28.4s, v1.4s, v9.s[3] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v29.16b, v29.16b, v29.16b - fmls v29.4s, v0.4s, v9.4s[3] + fmls v29.4s, v0.4s, v9.s[3] #else - fmul v29.4s, v0.4s, v9.4s[3] + fmul v29.4s, v0.4s, v9.s[3] #endif - OP_ir v29.4s, v1.4s, v8.4s[3] + OP_ir v29.4s, v1.4s, v8.s[3] ld2 {v12.4s, v13.4s}, [pB] add pB, pB, #32 @@ -221,85 
+221,85 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL4x4_M1 - OP_rr v16.4s, v0.4s, v8.4s[0] - OP_ii v16.4s, v1.4s, v9.4s[0] - OP_ri v17.4s, v0.4s, v9.4s[0] - OP_ir v17.4s, v1.4s, v8.4s[0] + OP_rr v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v9.s[0] + OP_ri v17.4s, v0.4s, v9.s[0] + OP_ir v17.4s, v1.4s, v8.s[0] ld2 {v12.4s, v13.4s}, [pB] // For next round add pB, pB, #32 - OP_rr v20.4s, v0.4s, v8.4s[1] - OP_ii v20.4s, v1.4s, v9.4s[1] - OP_ri v21.4s, v0.4s, v9.4s[1] - OP_ir v21.4s, v1.4s, v8.4s[1] + OP_rr v20.4s, v0.4s, v8.s[1] + OP_ii v20.4s, v1.4s, v9.s[1] + OP_ri v21.4s, v0.4s, v9.s[1] + OP_ir v21.4s, v1.4s, v8.s[1] ld2 {v4.4s, v5.4s}, [pA] // For next round add pA, pA, #32 - OP_rr v24.4s, v0.4s, v8.4s[2] - OP_ii v24.4s, v1.4s, v9.4s[2] - OP_ri v25.4s, v0.4s, v9.4s[2] - OP_ir v25.4s, v1.4s, v8.4s[2] + OP_rr v24.4s, v0.4s, v8.s[2] + OP_ii v24.4s, v1.4s, v9.s[2] + OP_ri v25.4s, v0.4s, v9.s[2] + OP_ir v25.4s, v1.4s, v8.s[2] prfm PLDL1KEEP, [pA, #512] - OP_rr v28.4s, v0.4s, v8.4s[3] - OP_ii v28.4s, v1.4s, v9.4s[3] - OP_ri v29.4s, v0.4s, v9.4s[3] - OP_ir v29.4s, v1.4s, v8.4s[3] + OP_rr v28.4s, v0.4s, v8.s[3] + OP_ii v28.4s, v1.4s, v9.s[3] + OP_ri v29.4s, v0.4s, v9.s[3] + OP_ir v29.4s, v1.4s, v8.s[3] .endm .macro KERNEL4x4_M2 - OP_rr v16.4s, v4.4s, v12.4s[0] - OP_ii v16.4s, v5.4s, v13.4s[0] - OP_ri v17.4s, v4.4s, v13.4s[0] - OP_ir v17.4s, v5.4s, v12.4s[0] + OP_rr v16.4s, v4.4s, v12.s[0] + OP_ii v16.4s, v5.4s, v13.s[0] + OP_ri v17.4s, v4.4s, v13.s[0] + OP_ir v17.4s, v5.4s, v12.s[0] ld2 {v8.4s, v9.4s}, [pB] // For next round add pB, pB, #32 - OP_rr v20.4s, v4.4s, v12.4s[1] - OP_ii v20.4s, v5.4s, v13.4s[1] - OP_ri v21.4s, v4.4s, v13.4s[1] - OP_ir v21.4s, v5.4s, v12.4s[1] + OP_rr v20.4s, v4.4s, v12.s[1] + OP_ii v20.4s, v5.4s, v13.s[1] + OP_ri v21.4s, v4.4s, v13.s[1] + OP_ir v21.4s, v5.4s, v12.s[1] ld2 {v0.4s, v1.4s}, [pA] // For next round add pA, pA, #32 - OP_rr v24.4s, v4.4s, v12.4s[2] - OP_ii v24.4s, v5.4s, v13.4s[2] - 
OP_ri v25.4s, v4.4s, v13.4s[2] - OP_ir v25.4s, v5.4s, v12.4s[2] + OP_rr v24.4s, v4.4s, v12.s[2] + OP_ii v24.4s, v5.4s, v13.s[2] + OP_ri v25.4s, v4.4s, v13.s[2] + OP_ir v25.4s, v5.4s, v12.s[2] prfm PLDL1KEEP, [pB, #512] - OP_rr v28.4s, v4.4s, v12.4s[3] - OP_ii v28.4s, v5.4s, v13.4s[3] - OP_ri v29.4s, v4.4s, v13.4s[3] - OP_ir v29.4s, v5.4s, v12.4s[3] + OP_rr v28.4s, v4.4s, v12.s[3] + OP_ii v28.4s, v5.4s, v13.s[3] + OP_ri v29.4s, v4.4s, v13.s[3] + OP_ir v29.4s, v5.4s, v12.s[3] .endm .macro KERNEL4x4_E - OP_rr v16.4s, v4.4s, v12.4s[0] - OP_ii v16.4s, v5.4s, v13.4s[0] - OP_ri v17.4s, v4.4s, v13.4s[0] - OP_ir v17.4s, v5.4s, v12.4s[0] - - OP_rr v20.4s, v4.4s, v12.4s[1] - OP_ii v20.4s, v5.4s, v13.4s[1] - OP_ri v21.4s, v4.4s, v13.4s[1] - OP_ir v21.4s, v5.4s, v12.4s[1] - - OP_rr v24.4s, v4.4s, v12.4s[2] - OP_ii v24.4s, v5.4s, v13.4s[2] - OP_ri v25.4s, v4.4s, v13.4s[2] - OP_ir v25.4s, v5.4s, v12.4s[2] - - OP_rr v28.4s, v4.4s, v12.4s[3] - OP_ii v28.4s, v5.4s, v13.4s[3] - OP_ri v29.4s, v4.4s, v13.4s[3] - OP_ir v29.4s, v5.4s, v12.4s[3] + OP_rr v16.4s, v4.4s, v12.s[0] + OP_ii v16.4s, v5.4s, v13.s[0] + OP_ri v17.4s, v4.4s, v13.s[0] + OP_ir v17.4s, v5.4s, v12.s[0] + + OP_rr v20.4s, v4.4s, v12.s[1] + OP_ii v20.4s, v5.4s, v13.s[1] + OP_ri v21.4s, v4.4s, v13.s[1] + OP_ir v21.4s, v5.4s, v12.s[1] + + OP_rr v24.4s, v4.4s, v12.s[2] + OP_ii v24.4s, v5.4s, v13.s[2] + OP_ri v25.4s, v4.4s, v13.s[2] + OP_ir v25.4s, v5.4s, v12.s[2] + + OP_rr v28.4s, v4.4s, v12.s[3] + OP_ii v28.4s, v5.4s, v13.s[3] + OP_ri v29.4s, v4.4s, v13.s[3] + OP_ir v29.4s, v5.4s, v12.s[3] .endm .macro KERNEL4x4_SUB @@ -308,25 +308,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld2 {v0.4s, v1.4s}, [pA] add pA, pA, #32 - OP_rr v16.4s, v0.4s, v8.4s[0] - OP_ii v16.4s, v1.4s, v9.4s[0] - OP_ri v17.4s, v0.4s, v9.4s[0] - OP_ir v17.4s, v1.4s, v8.4s[0] - - OP_rr v20.4s, v0.4s, v8.4s[1] - OP_ii v20.4s, v1.4s, v9.4s[1] - OP_ri v21.4s, v0.4s, v9.4s[1] - OP_ir v21.4s, v1.4s, v8.4s[1] - - OP_rr v24.4s, v0.4s, v8.4s[2] - OP_ii v24.4s, v1.4s, v9.4s[2] - OP_ri v25.4s, v0.4s, v9.4s[2] - OP_ir v25.4s, v1.4s, v8.4s[2] - - OP_rr v28.4s, v0.4s, v8.4s[3] - OP_ii v28.4s, v1.4s, v9.4s[3] - OP_ri v29.4s, v0.4s, v9.4s[3] - OP_ir v29.4s, v1.4s, v8.4s[3] + OP_rr v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v9.s[0] + OP_ri v17.4s, v0.4s, v9.s[0] + OP_ir v17.4s, v1.4s, v8.s[0] + + OP_rr v20.4s, v0.4s, v8.s[1] + OP_ii v20.4s, v1.4s, v9.s[1] + OP_ri v21.4s, v0.4s, v9.s[1] + OP_ir v21.4s, v1.4s, v8.s[1] + + OP_rr v24.4s, v0.4s, v8.s[2] + OP_ii v24.4s, v1.4s, v9.s[2] + OP_ri v25.4s, v0.4s, v9.s[2] + OP_ir v25.4s, v1.4s, v8.s[2] + + OP_rr v28.4s, v0.4s, v8.s[3] + OP_ii v28.4s, v1.4s, v9.s[3] + OP_ri v29.4s, v0.4s, v9.s[3] + OP_ir v29.4s, v1.4s, v8.s[3] .endm .macro SAVE4x4 @@ -384,25 +384,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld2 {v0.2s, v1.2s}, [pA] add pA, pA, #16 - OP_rr v16.2s, v0.2s, v8.4s[0] - OP_ii v16.2s, v1.2s, v9.4s[0] - OP_ri v17.2s, v0.2s, v9.4s[0] - OP_ir v17.2s, v1.2s, v8.4s[0] - - OP_rr v20.2s, v0.2s, v8.4s[1] - OP_ii v20.2s, v1.2s, v9.4s[1] - OP_ri v21.2s, v0.2s, v9.4s[1] - OP_ir v21.2s, v1.2s, v8.4s[1] - - OP_rr v24.2s, v0.2s, v8.4s[2] - OP_ii v24.2s, v1.2s, v9.4s[2] - OP_ri v25.2s, v0.2s, v9.4s[2] - OP_ir v25.2s, v1.2s, v8.4s[2] - - OP_rr v28.2s, v0.2s, v8.4s[3] - OP_ii v28.2s, v1.2s, v9.4s[3] - OP_ri v29.2s, v0.2s, v9.4s[3] - OP_ir v29.2s, v1.2s, v8.4s[3] + OP_rr v16.2s, v0.2s, v8.s[0] + OP_ii v16.2s, v1.2s, v9.s[0] + OP_ri v17.2s, v0.2s, v9.s[0] + OP_ir v17.2s, v1.2s, v8.s[0] + + OP_rr v20.2s, v0.2s, v8.s[1] + OP_ii v20.2s, v1.2s, v9.s[1] + OP_ri v21.2s, v0.2s, v9.s[1] + OP_ir v21.2s, v1.2s, v8.s[1] + + OP_rr v24.2s, v0.2s, v8.s[2] + OP_ii v24.2s, v1.2s, v9.s[2] + OP_ri v25.2s, v0.2s, v9.s[2] + OP_ir v25.2s, v1.2s, v8.s[2] + + OP_rr v28.2s, v0.2s, v8.s[3] + OP_ii v28.2s, v1.2s, v9.s[3] + OP_ri v29.2s, v0.2s, v9.s[3] + OP_ir v29.2s, v1.2s, v8.s[3] .endm .macro SAVE2x4 @@ -460,25 +460,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld2 {v0.s, v1.s}[0], [pA] add pA, pA, #8 - OP_rr s16, s0, v8.4s[0] - OP_ii s16, s1, v9.4s[0] - OP_ri s17, s0, v9.4s[0] - OP_ir s17, s1, v8.4s[0] - - OP_rr s20, s0, v8.4s[1] - OP_ii s20, s1, v9.4s[1] - OP_ri s21, s0, v9.4s[1] - OP_ir s21, s1, v8.4s[1] - - OP_rr s24, s0, v8.4s[2] - OP_ii s24, s1, v9.4s[2] - OP_ri s25, s0, v9.4s[2] - OP_ir s25, s1, v8.4s[2] - - OP_rr s28, s0, v8.4s[3] - OP_ii s28, s1, v9.4s[3] - OP_ri s29, s0, v9.4s[3] - OP_ir s29, s1, v8.4s[3] + OP_rr s16, s0, v8.s[0] + OP_ii s16, s1, v9.s[0] + OP_ri s17, s0, v9.s[0] + OP_ir s17, s1, v8.s[0] + + OP_rr s20, s0, v8.s[1] + OP_ii s20, s1, v9.s[1] + OP_ri s21, s0, v9.s[1] + OP_ir s21, s1, v8.s[1] + + OP_rr s24, s0, v8.s[2] + OP_ii s24, s1, v9.s[2] + OP_ri s25, s0, v9.s[2] + OP_ir s25, s1, v8.s[2] + + OP_rr s28, s0, v8.s[3] + OP_ii s28, s1, v9.s[3] + OP_ri s29, s0, v9.s[3] + OP_ir s29, s1, v8.s[3] .endm .macro SAVE1x4 @@ -532,15 +532,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.4s, v1.4s}, [pA] add pA, pA, #32 - OP_rr v16.4s, v0.4s, v8.2s[0] - OP_ii v16.4s, v1.4s, v9.2s[0] - OP_ri v17.4s, v0.4s, v9.2s[0] - OP_ir v17.4s, v1.4s, v8.2s[0] + OP_rr v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v9.s[0] + OP_ri v17.4s, v0.4s, v9.s[0] + OP_ir v17.4s, v1.4s, v8.s[0] - OP_rr v20.4s, v0.4s, v8.2s[1] - OP_ii v20.4s, v1.4s, v9.2s[1] - OP_ri v21.4s, v0.4s, v9.2s[1] - OP_ir v21.4s, v1.4s, v8.2s[1] + OP_rr v20.4s, v0.4s, v8.s[1] + OP_ii v20.4s, v1.4s, v9.s[1] + OP_ri v21.4s, v0.4s, v9.s[1] + OP_ir v21.4s, v1.4s, v8.s[1] .endm .macro SAVE4x2 @@ -578,15 +578,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld2 {v0.2s, v1.2s}, [pA] add pA, pA, #16 - OP_rr v16.2s, v0.2s, v8.2s[0] - OP_ii v16.2s, v1.2s, v9.2s[0] - OP_ri v17.2s, v0.2s, v9.2s[0] - OP_ir v17.2s, v1.2s, v8.2s[0] + OP_rr v16.2s, v0.2s, v8.s[0] + OP_ii v16.2s, v1.2s, v9.s[0] + OP_ri v17.2s, v0.2s, v9.s[0] + OP_ir v17.2s, v1.2s, v8.s[0] - OP_rr v20.2s, v0.2s, v8.2s[1] - OP_ii v20.2s, v1.2s, v9.2s[1] - OP_ri v21.2s, v0.2s, v9.2s[1] - OP_ir v21.2s, v1.2s, v8.2s[1] + OP_rr v20.2s, v0.2s, v8.s[1] + OP_ii v20.2s, v1.2s, v9.s[1] + OP_ri v21.2s, v0.2s, v9.s[1] + OP_ir v21.2s, v1.2s, v8.s[1] .endm .macro SAVE2x2 @@ -624,15 +624,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.s, v1.s}[0], [pA] add pA, pA, #8 - OP_rr s16, s0, v8.2s[0] - OP_ii s16, s1, v9.2s[0] - OP_ri s17, s0, v9.2s[0] - OP_ir s17, s1, v8.2s[0] + OP_rr s16, s0, v8.s[0] + OP_ii s16, s1, v9.s[0] + OP_ri s17, s0, v9.s[0] + OP_ir s17, s1, v8.s[0] - OP_rr s20, s0, v8.2s[1] - OP_ii s20, s1, v9.2s[1] - OP_ri s21, s0, v9.2s[1] - OP_ir s21, s1, v8.2s[1] + OP_rr s20, s0, v8.s[1] + OP_ii s20, s1, v9.s[1] + OP_ri s21, s0, v9.s[1] + OP_ir s21, s1, v8.s[1] .endm .macro SAVE1x2 diff --git a/kernel/arm64/ctrmm_kernel_8x4.S b/kernel/arm64/ctrmm_kernel_8x4.S old mode 100755 new mode 100644 index 3131541d4..ce5cb0406 --- a/kernel/arm64/ctrmm_kernel_8x4.S +++ b/kernel/arm64/ctrmm_kernel_8x4.S @@ -180,93 +180,93 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld2 {v2.4s, v3.4s}, [pA] add pA, pA, #32 - fmul v16.4s, v0.4s, v8.4s[0] - OP_ii v16.4s, v1.4s, v9.4s[0] + fmul v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v9.s[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v17.16b, v17.16b, v17.16b - fmls v17.4s, v0.4s, v9.4s[0] + fmls v17.4s, v0.4s, v9.s[0] #else - fmul v17.4s, v0.4s, v9.4s[0] + fmul v17.4s, v0.4s, v9.s[0] #endif - OP_ir v17.4s, v1.4s, v8.4s[0] + OP_ir v17.4s, v1.4s, v8.s[0] - fmul v18.4s, v2.4s, v8.4s[0] - OP_ii v18.4s, v3.4s, v9.4s[0] + fmul v18.4s, v2.4s, v8.s[0] + OP_ii v18.4s, v3.4s, v9.s[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v19.16b, v19.16b, v19.16b - fmls v19.4s, v2.4s, v9.4s[0] + fmls v19.4s, v2.4s, v9.s[0] #else - fmul v19.4s, v2.4s, v9.4s[0] + fmul v19.4s, v2.4s, v9.s[0] #endif - OP_ir v19.4s, v3.4s, v8.4s[0] + OP_ir v19.4s, v3.4s, v8.s[0] - fmul v20.4s, v0.4s, v8.4s[1] - OP_ii v20.4s, v1.4s, v9.4s[1] + fmul v20.4s, v0.4s, v8.s[1] + OP_ii v20.4s, v1.4s, v9.s[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v21.16b, v21.16b, v21.16b - fmls v21.4s, v0.4s, v9.4s[1] + fmls v21.4s, v0.4s, v9.s[1] #else - fmul v21.4s, v0.4s, v9.4s[1] + fmul v21.4s, v0.4s, v9.s[1] #endif - OP_ir v21.4s, v1.4s, v8.4s[1] + OP_ir v21.4s, v1.4s, v8.s[1] - fmul v22.4s, v2.4s, v8.4s[1] - OP_ii v22.4s, v3.4s, v9.4s[1] + fmul v22.4s, v2.4s, v8.s[1] + OP_ii v22.4s, v3.4s, v9.s[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v23.16b, v23.16b, v23.16b - fmls v23.4s, v2.4s, v9.4s[1] + fmls v23.4s, v2.4s, v9.s[1] #else - fmul v23.4s, v2.4s, v9.4s[1] + fmul v23.4s, v2.4s, v9.s[1] #endif - OP_ir v23.4s, v3.4s, v8.4s[1] + OP_ir v23.4s, v3.4s, v8.s[1] - fmul v24.4s, v0.4s, v8.4s[2] - OP_ii v24.4s, v1.4s, 
v9.4s[2] + fmul v24.4s, v0.4s, v8.s[2] + OP_ii v24.4s, v1.4s, v9.s[2] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v25.16b, v25.16b, v25.16b - fmls v25.4s, v0.4s, v9.4s[2] + fmls v25.4s, v0.4s, v9.s[2] #else - fmul v25.4s, v0.4s, v9.4s[2] + fmul v25.4s, v0.4s, v9.s[2] #endif - OP_ir v25.4s, v1.4s, v8.4s[2] + OP_ir v25.4s, v1.4s, v8.s[2] - fmul v26.4s, v2.4s, v8.4s[2] - OP_ii v26.4s, v3.4s, v9.4s[2] + fmul v26.4s, v2.4s, v8.s[2] + OP_ii v26.4s, v3.4s, v9.s[2] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v27.16b, v27.16b, v27.16b - fmls v27.4s, v2.4s, v9.4s[2] + fmls v27.4s, v2.4s, v9.s[2] #else - fmul v27.4s, v2.4s, v9.4s[2] + fmul v27.4s, v2.4s, v9.s[2] #endif - OP_ir v27.4s, v3.4s, v8.4s[2] + OP_ir v27.4s, v3.4s, v8.s[2] - fmul v28.4s, v0.4s, v8.4s[3] - OP_ii v28.4s, v1.4s, v9.4s[3] + fmul v28.4s, v0.4s, v8.s[3] + OP_ii v28.4s, v1.4s, v9.s[3] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v29.16b, v29.16b, v29.16b - fmls v29.4s, v0.4s, v9.4s[3] + fmls v29.4s, v0.4s, v9.s[3] #else - fmul v29.4s, v0.4s, v9.4s[3] + fmul v29.4s, v0.4s, v9.s[3] #endif - OP_ir v29.4s, v1.4s, v8.4s[3] + OP_ir v29.4s, v1.4s, v8.s[3] - fmul v30.4s, v2.4s, v8.4s[3] - OP_ii v30.4s, v3.4s, v9.4s[3] + fmul v30.4s, v2.4s, v8.s[3] + OP_ii v30.4s, v3.4s, v9.s[3] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v31.16b, v31.16b, v31.16b - fmls v31.4s, v2.4s, v9.4s[3] + fmls v31.4s, v2.4s, v9.s[3] #else - fmul v31.4s, v2.4s, v9.4s[3] + fmul v31.4s, v2.4s, v9.s[3] #endif - OP_ir v31.4s, v3.4s, v8.4s[3] + OP_ir v31.4s, v3.4s, v8.s[3] ld2 {v12.4s, v13.4s}, [pB] add pB, pB, #32 @@ -277,45 +277,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL8x4_M1 - OP_rr v16.4s, v0.4s, v8.4s[0] - OP_ii v16.4s, v1.4s, v9.4s[0] - OP_ri v17.4s, v0.4s, v9.4s[0] - OP_ir v17.4s, v1.4s, v8.4s[0] - - OP_rr v18.4s, v2.4s, v8.4s[0] - OP_ii v18.4s, v3.4s, v9.4s[0] - OP_ri v19.4s, v2.4s, v9.4s[0] - OP_ir v19.4s, v3.4s, v8.4s[0] - - OP_rr v20.4s, v0.4s, v8.4s[1] - OP_ii v20.4s, v1.4s, v9.4s[1] - OP_ri v21.4s, v0.4s, v9.4s[1] - OP_ir v21.4s, v1.4s, v8.4s[1] - - OP_rr v22.4s, v2.4s, v8.4s[1] - OP_ii v22.4s, v3.4s, v9.4s[1] - OP_ri v23.4s, v2.4s, v9.4s[1] - OP_ir v23.4s, v3.4s, v8.4s[1] - - OP_rr v24.4s, v0.4s, v8.4s[2] - OP_ii v24.4s, v1.4s, v9.4s[2] - OP_ri v25.4s, v0.4s, v9.4s[2] - OP_ir v25.4s, v1.4s, v8.4s[2] - - OP_rr v26.4s, v2.4s, v8.4s[2] - OP_ii v26.4s, v3.4s, v9.4s[2] - OP_ri v27.4s, v2.4s, v9.4s[2] - OP_ir v27.4s, v3.4s, v8.4s[2] - - OP_rr v28.4s, v0.4s, v8.4s[3] - OP_ii v28.4s, v1.4s, v9.4s[3] - OP_ri v29.4s, v0.4s, v9.4s[3] - OP_ir v29.4s, v1.4s, v8.4s[3] - - OP_rr v30.4s, v2.4s, v8.4s[3] - OP_ii v30.4s, v3.4s, v9.4s[3] - OP_ri v31.4s, v2.4s, v9.4s[3] - OP_ir v31.4s, v3.4s, v8.4s[3] + OP_rr v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v9.s[0] + OP_ri v17.4s, v0.4s, v9.s[0] + OP_ir v17.4s, v1.4s, v8.s[0] + + OP_rr v18.4s, v2.4s, v8.s[0] + OP_ii v18.4s, v3.4s, v9.s[0] + OP_ri v19.4s, v2.4s, v9.s[0] + OP_ir v19.4s, v3.4s, v8.s[0] + + OP_rr v20.4s, v0.4s, v8.s[1] + OP_ii v20.4s, v1.4s, v9.s[1] + OP_ri v21.4s, v0.4s, v9.s[1] + OP_ir v21.4s, v1.4s, v8.s[1] + + OP_rr v22.4s, v2.4s, v8.s[1] + OP_ii v22.4s, v3.4s, v9.s[1] + OP_ri v23.4s, v2.4s, v9.s[1] + OP_ir v23.4s, v3.4s, v8.s[1] + + OP_rr v24.4s, v0.4s, v8.s[2] + OP_ii v24.4s, v1.4s, v9.s[2] + OP_ri v25.4s, v0.4s, v9.s[2] + OP_ir v25.4s, v1.4s, v8.s[2] + + OP_rr v26.4s, v2.4s, v8.s[2] + OP_ii v26.4s, v3.4s, v9.s[2] + OP_ri v27.4s, v2.4s, v9.s[2] + OP_ir v27.4s, v3.4s, v8.s[2] + + OP_rr v28.4s, v0.4s, v8.s[3] + OP_ii v28.4s, v1.4s, v9.s[3] + OP_ri v29.4s, v0.4s, v9.s[3] + OP_ir v29.4s, v1.4s, v8.s[3] + + OP_rr v30.4s, v2.4s, v8.s[3] + OP_ii v30.4s, v3.4s, 
v9.s[3] + OP_ri v31.4s, v2.4s, v9.s[3] + OP_ir v31.4s, v3.4s, v8.s[3] ld2 {v12.4s, v13.4s}, [pB] // For next round add pB, pB, #32 @@ -326,45 +326,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL8x4_M2 - OP_rr v16.4s, v4.4s, v12.4s[0] - OP_ii v16.4s, v5.4s, v13.4s[0] - OP_ri v17.4s, v4.4s, v13.4s[0] - OP_ir v17.4s, v5.4s, v12.4s[0] - - OP_rr v18.4s, v6.4s, v12.4s[0] - OP_ii v18.4s, v7.4s, v13.4s[0] - OP_ri v19.4s, v6.4s, v13.4s[0] - OP_ir v19.4s, v7.4s, v12.4s[0] - - OP_rr v20.4s, v4.4s, v12.4s[1] - OP_ii v20.4s, v5.4s, v13.4s[1] - OP_ri v21.4s, v4.4s, v13.4s[1] - OP_ir v21.4s, v5.4s, v12.4s[1] - - OP_rr v22.4s, v6.4s, v12.4s[1] - OP_ii v22.4s, v7.4s, v13.4s[1] - OP_ri v23.4s, v6.4s, v13.4s[1] - OP_ir v23.4s, v7.4s, v12.4s[1] - - OP_rr v24.4s, v4.4s, v12.4s[2] - OP_ii v24.4s, v5.4s, v13.4s[2] - OP_ri v25.4s, v4.4s, v13.4s[2] - OP_ir v25.4s, v5.4s, v12.4s[2] - - OP_rr v26.4s, v6.4s, v12.4s[2] - OP_ii v26.4s, v7.4s, v13.4s[2] - OP_ri v27.4s, v6.4s, v13.4s[2] - OP_ir v27.4s, v7.4s, v12.4s[2] - - OP_rr v28.4s, v4.4s, v12.4s[3] - OP_ii v28.4s, v5.4s, v13.4s[3] - OP_ri v29.4s, v4.4s, v13.4s[3] - OP_ir v29.4s, v5.4s, v12.4s[3] - - OP_rr v30.4s, v6.4s, v12.4s[3] - OP_ii v30.4s, v7.4s, v13.4s[3] - OP_ri v31.4s, v6.4s, v13.4s[3] - OP_ir v31.4s, v7.4s, v12.4s[3] + OP_rr v16.4s, v4.4s, v12.s[0] + OP_ii v16.4s, v5.4s, v13.s[0] + OP_ri v17.4s, v4.4s, v13.s[0] + OP_ir v17.4s, v5.4s, v12.s[0] + + OP_rr v18.4s, v6.4s, v12.s[0] + OP_ii v18.4s, v7.4s, v13.s[0] + OP_ri v19.4s, v6.4s, v13.s[0] + OP_ir v19.4s, v7.4s, v12.s[0] + + OP_rr v20.4s, v4.4s, v12.s[1] + OP_ii v20.4s, v5.4s, v13.s[1] + OP_ri v21.4s, v4.4s, v13.s[1] + OP_ir v21.4s, v5.4s, v12.s[1] + + OP_rr v22.4s, v6.4s, v12.s[1] + OP_ii v22.4s, v7.4s, v13.s[1] + OP_ri v23.4s, v6.4s, v13.s[1] + OP_ir v23.4s, v7.4s, v12.s[1] + + OP_rr v24.4s, v4.4s, v12.s[2] + OP_ii v24.4s, v5.4s, v13.s[2] + OP_ri v25.4s, v4.4s, v13.s[2] + OP_ir v25.4s, v5.4s, v12.s[2] + + OP_rr v26.4s, v6.4s, 
v12.s[2] + OP_ii v26.4s, v7.4s, v13.s[2] + OP_ri v27.4s, v6.4s, v13.s[2] + OP_ir v27.4s, v7.4s, v12.s[2] + + OP_rr v28.4s, v4.4s, v12.s[3] + OP_ii v28.4s, v5.4s, v13.s[3] + OP_ri v29.4s, v4.4s, v13.s[3] + OP_ir v29.4s, v5.4s, v12.s[3] + + OP_rr v30.4s, v6.4s, v12.s[3] + OP_ii v30.4s, v7.4s, v13.s[3] + OP_ri v31.4s, v6.4s, v13.s[3] + OP_ir v31.4s, v7.4s, v12.s[3] ld2 {v8.4s, v9.4s}, [pB] add pB, pB, #32 @@ -375,45 +375,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL8x4_E - OP_rr v16.4s, v4.4s, v12.4s[0] - OP_ii v16.4s, v5.4s, v13.4s[0] - OP_ri v17.4s, v4.4s, v13.4s[0] - OP_ir v17.4s, v5.4s, v12.4s[0] - - OP_rr v18.4s, v6.4s, v12.4s[0] - OP_ii v18.4s, v7.4s, v13.4s[0] - OP_ri v19.4s, v6.4s, v13.4s[0] - OP_ir v19.4s, v7.4s, v12.4s[0] - - OP_rr v20.4s, v4.4s, v12.4s[1] - OP_ii v20.4s, v5.4s, v13.4s[1] - OP_ri v21.4s, v4.4s, v13.4s[1] - OP_ir v21.4s, v5.4s, v12.4s[1] - - OP_rr v22.4s, v6.4s, v12.4s[1] - OP_ii v22.4s, v7.4s, v13.4s[1] - OP_ri v23.4s, v6.4s, v13.4s[1] - OP_ir v23.4s, v7.4s, v12.4s[1] - - OP_rr v24.4s, v4.4s, v12.4s[2] - OP_ii v24.4s, v5.4s, v13.4s[2] - OP_ri v25.4s, v4.4s, v13.4s[2] - OP_ir v25.4s, v5.4s, v12.4s[2] - - OP_rr v26.4s, v6.4s, v12.4s[2] - OP_ii v26.4s, v7.4s, v13.4s[2] - OP_ri v27.4s, v6.4s, v13.4s[2] - OP_ir v27.4s, v7.4s, v12.4s[2] - - OP_rr v28.4s, v4.4s, v12.4s[3] - OP_ii v28.4s, v5.4s, v13.4s[3] - OP_ri v29.4s, v4.4s, v13.4s[3] - OP_ir v29.4s, v5.4s, v12.4s[3] - - OP_rr v30.4s, v6.4s, v12.4s[3] - OP_ii v30.4s, v7.4s, v13.4s[3] - OP_ri v31.4s, v6.4s, v13.4s[3] - OP_ir v31.4s, v7.4s, v12.4s[3] + OP_rr v16.4s, v4.4s, v12.s[0] + OP_ii v16.4s, v5.4s, v13.s[0] + OP_ri v17.4s, v4.4s, v13.s[0] + OP_ir v17.4s, v5.4s, v12.s[0] + + OP_rr v18.4s, v6.4s, v12.s[0] + OP_ii v18.4s, v7.4s, v13.s[0] + OP_ri v19.4s, v6.4s, v13.s[0] + OP_ir v19.4s, v7.4s, v12.s[0] + + OP_rr v20.4s, v4.4s, v12.s[1] + OP_ii v20.4s, v5.4s, v13.s[1] + OP_ri v21.4s, v4.4s, v13.s[1] + OP_ir v21.4s, v5.4s, v12.s[1] + + OP_rr 
v22.4s, v6.4s, v12.s[1] + OP_ii v22.4s, v7.4s, v13.s[1] + OP_ri v23.4s, v6.4s, v13.s[1] + OP_ir v23.4s, v7.4s, v12.s[1] + + OP_rr v24.4s, v4.4s, v12.s[2] + OP_ii v24.4s, v5.4s, v13.s[2] + OP_ri v25.4s, v4.4s, v13.s[2] + OP_ir v25.4s, v5.4s, v12.s[2] + + OP_rr v26.4s, v6.4s, v12.s[2] + OP_ii v26.4s, v7.4s, v13.s[2] + OP_ri v27.4s, v6.4s, v13.s[2] + OP_ir v27.4s, v7.4s, v12.s[2] + + OP_rr v28.4s, v4.4s, v12.s[3] + OP_ii v28.4s, v5.4s, v13.s[3] + OP_ri v29.4s, v4.4s, v13.s[3] + OP_ir v29.4s, v5.4s, v12.s[3] + + OP_rr v30.4s, v6.4s, v12.s[3] + OP_ii v30.4s, v7.4s, v13.s[3] + OP_ri v31.4s, v6.4s, v13.s[3] + OP_ir v31.4s, v7.4s, v12.s[3] .endm @@ -425,45 +425,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v2.4s, v3.4s}, [pA] add pA, pA, #32 - OP_rr v16.4s, v0.4s, v8.4s[0] - OP_ii v16.4s, v1.4s, v9.4s[0] - OP_ri v17.4s, v0.4s, v9.4s[0] - OP_ir v17.4s, v1.4s, v8.4s[0] - - OP_rr v18.4s, v2.4s, v8.4s[0] - OP_ii v18.4s, v3.4s, v9.4s[0] - OP_ri v19.4s, v2.4s, v9.4s[0] - OP_ir v19.4s, v3.4s, v8.4s[0] - - OP_rr v20.4s, v0.4s, v8.4s[1] - OP_ii v20.4s, v1.4s, v9.4s[1] - OP_ri v21.4s, v0.4s, v9.4s[1] - OP_ir v21.4s, v1.4s, v8.4s[1] - - OP_rr v22.4s, v2.4s, v8.4s[1] - OP_ii v22.4s, v3.4s, v9.4s[1] - OP_ri v23.4s, v2.4s, v9.4s[1] - OP_ir v23.4s, v3.4s, v8.4s[1] - - OP_rr v24.4s, v0.4s, v8.4s[2] - OP_ii v24.4s, v1.4s, v9.4s[2] - OP_ri v25.4s, v0.4s, v9.4s[2] - OP_ir v25.4s, v1.4s, v8.4s[2] - - OP_rr v26.4s, v2.4s, v8.4s[2] - OP_ii v26.4s, v3.4s, v9.4s[2] - OP_ri v27.4s, v2.4s, v9.4s[2] - OP_ir v27.4s, v3.4s, v8.4s[2] - - OP_rr v28.4s, v0.4s, v8.4s[3] - OP_ii v28.4s, v1.4s, v9.4s[3] - OP_ri v29.4s, v0.4s, v9.4s[3] - OP_ir v29.4s, v1.4s, v8.4s[3] - - OP_rr v30.4s, v2.4s, v8.4s[3] - OP_ii v30.4s, v3.4s, v9.4s[3] - OP_ri v31.4s, v2.4s, v9.4s[3] - OP_ir v31.4s, v3.4s, v8.4s[3] + OP_rr v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v9.s[0] + OP_ri v17.4s, v0.4s, v9.s[0] + OP_ir v17.4s, v1.4s, v8.s[0] + + OP_rr v18.4s, v2.4s, v8.s[0] + OP_ii v18.4s, v3.4s, 
v9.s[0] + OP_ri v19.4s, v2.4s, v9.s[0] + OP_ir v19.4s, v3.4s, v8.s[0] + + OP_rr v20.4s, v0.4s, v8.s[1] + OP_ii v20.4s, v1.4s, v9.s[1] + OP_ri v21.4s, v0.4s, v9.s[1] + OP_ir v21.4s, v1.4s, v8.s[1] + + OP_rr v22.4s, v2.4s, v8.s[1] + OP_ii v22.4s, v3.4s, v9.s[1] + OP_ri v23.4s, v2.4s, v9.s[1] + OP_ir v23.4s, v3.4s, v8.s[1] + + OP_rr v24.4s, v0.4s, v8.s[2] + OP_ii v24.4s, v1.4s, v9.s[2] + OP_ri v25.4s, v0.4s, v9.s[2] + OP_ir v25.4s, v1.4s, v8.s[2] + + OP_rr v26.4s, v2.4s, v8.s[2] + OP_ii v26.4s, v3.4s, v9.s[2] + OP_ri v27.4s, v2.4s, v9.s[2] + OP_ir v27.4s, v3.4s, v8.s[2] + + OP_rr v28.4s, v0.4s, v8.s[3] + OP_ii v28.4s, v1.4s, v9.s[3] + OP_ri v29.4s, v0.4s, v9.s[3] + OP_ir v29.4s, v1.4s, v8.s[3] + + OP_rr v30.4s, v2.4s, v8.s[3] + OP_ii v30.4s, v3.4s, v9.s[3] + OP_ri v31.4s, v2.4s, v9.s[3] + OP_ir v31.4s, v3.4s, v8.s[3] .endm @@ -562,49 +562,49 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.4s, v1.4s}, [pA] add pA, pA, #32 - fmul v16.4s, v0.4s, v8.4s[0] - OP_ii v16.4s, v1.4s, v9.4s[0] + fmul v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v9.s[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v17.16b, v17.16b, v17.16b - fmls v17.4s, v0.4s, v9.4s[0] + fmls v17.4s, v0.4s, v9.s[0] #else - fmul v17.4s, v0.4s, v9.4s[0] + fmul v17.4s, v0.4s, v9.s[0] #endif - OP_ir v17.4s, v1.4s, v8.4s[0] + OP_ir v17.4s, v1.4s, v8.s[0] - fmul v20.4s, v0.4s, v8.4s[1] - OP_ii v20.4s, v1.4s, v9.4s[1] + fmul v20.4s, v0.4s, v8.s[1] + OP_ii v20.4s, v1.4s, v9.s[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v21.16b, v21.16b, v21.16b - fmls v21.4s, v0.4s, v9.4s[1] + fmls v21.4s, v0.4s, v9.s[1] #else - fmul v21.4s, v0.4s, v9.4s[1] + fmul v21.4s, v0.4s, v9.s[1] #endif - OP_ir v21.4s, v1.4s, v8.4s[1] + OP_ir v21.4s, v1.4s, v8.s[1] - fmul v24.4s, v0.4s, v8.4s[2] - OP_ii v24.4s, v1.4s, v9.4s[2] + fmul v24.4s, 
v0.4s, v8.s[2] + OP_ii v24.4s, v1.4s, v9.s[2] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v25.16b, v25.16b, v25.16b - fmls v25.4s, v0.4s, v9.4s[2] + fmls v25.4s, v0.4s, v9.s[2] #else - fmul v25.4s, v0.4s, v9.4s[2] + fmul v25.4s, v0.4s, v9.s[2] #endif - OP_ir v25.4s, v1.4s, v8.4s[2] + OP_ir v25.4s, v1.4s, v8.s[2] - fmul v28.4s, v0.4s, v8.4s[3] - OP_ii v28.4s, v1.4s, v9.4s[3] + fmul v28.4s, v0.4s, v8.s[3] + OP_ii v28.4s, v1.4s, v9.s[3] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v29.16b, v29.16b, v29.16b - fmls v29.4s, v0.4s, v9.4s[3] + fmls v29.4s, v0.4s, v9.s[3] #else - fmul v29.4s, v0.4s, v9.4s[3] + fmul v29.4s, v0.4s, v9.s[3] #endif - OP_ir v29.4s, v1.4s, v8.4s[3] + OP_ir v29.4s, v1.4s, v8.s[3] ld2 {v12.4s, v13.4s}, [pB] add pB, pB, #32 @@ -613,85 +613,85 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL4x4_M1 - OP_rr v16.4s, v0.4s, v8.4s[0] - OP_ii v16.4s, v1.4s, v9.4s[0] - OP_ri v17.4s, v0.4s, v9.4s[0] - OP_ir v17.4s, v1.4s, v8.4s[0] + OP_rr v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v9.s[0] + OP_ri v17.4s, v0.4s, v9.s[0] + OP_ir v17.4s, v1.4s, v8.s[0] ld2 {v12.4s, v13.4s}, [pB] // For next round add pB, pB, #32 - OP_rr v20.4s, v0.4s, v8.4s[1] - OP_ii v20.4s, v1.4s, v9.4s[1] - OP_ri v21.4s, v0.4s, v9.4s[1] - OP_ir v21.4s, v1.4s, v8.4s[1] + OP_rr v20.4s, v0.4s, v8.s[1] + OP_ii v20.4s, v1.4s, v9.s[1] + OP_ri v21.4s, v0.4s, v9.s[1] + OP_ir v21.4s, v1.4s, v8.s[1] ld2 {v4.4s, v5.4s}, [pA] // For next round add pA, pA, #32 - OP_rr v24.4s, v0.4s, v8.4s[2] - OP_ii v24.4s, v1.4s, v9.4s[2] - OP_ri v25.4s, v0.4s, v9.4s[2] - OP_ir v25.4s, v1.4s, v8.4s[2] + OP_rr v24.4s, v0.4s, v8.s[2] + OP_ii v24.4s, v1.4s, v9.s[2] + OP_ri v25.4s, v0.4s, v9.s[2] + OP_ir v25.4s, v1.4s, v8.s[2] prfm PLDL1KEEP, [pA, #512] - OP_rr v28.4s, v0.4s, v8.4s[3] - OP_ii v28.4s, v1.4s, v9.4s[3] - OP_ri v29.4s, v0.4s, v9.4s[3] - OP_ir v29.4s, v1.4s, v8.4s[3] + OP_rr v28.4s, v0.4s, v8.s[3] + OP_ii v28.4s, v1.4s, v9.s[3] + OP_ri v29.4s, v0.4s, v9.s[3] + OP_ir v29.4s, v1.4s, v8.s[3] .endm .macro KERNEL4x4_M2 - OP_rr v16.4s, v4.4s, v12.4s[0] - OP_ii v16.4s, v5.4s, v13.4s[0] - OP_ri v17.4s, v4.4s, v13.4s[0] - OP_ir v17.4s, v5.4s, v12.4s[0] + OP_rr v16.4s, v4.4s, v12.s[0] + OP_ii v16.4s, v5.4s, v13.s[0] + OP_ri v17.4s, v4.4s, v13.s[0] + OP_ir v17.4s, v5.4s, v12.s[0] ld2 {v8.4s, v9.4s}, [pB] // For next round add pB, pB, #32 - OP_rr v20.4s, v4.4s, v12.4s[1] - OP_ii v20.4s, v5.4s, v13.4s[1] - OP_ri v21.4s, v4.4s, v13.4s[1] - OP_ir v21.4s, v5.4s, v12.4s[1] + OP_rr v20.4s, v4.4s, v12.s[1] + OP_ii v20.4s, v5.4s, v13.s[1] + OP_ri v21.4s, v4.4s, v13.s[1] + OP_ir v21.4s, v5.4s, v12.s[1] ld2 {v0.4s, v1.4s}, [pA] // For next round add pA, pA, #32 - OP_rr v24.4s, v4.4s, v12.4s[2] - OP_ii v24.4s, v5.4s, v13.4s[2] - OP_ri v25.4s, v4.4s, v13.4s[2] - OP_ir v25.4s, v5.4s, v12.4s[2] + OP_rr v24.4s, v4.4s, 
v12.s[2] + OP_ii v24.4s, v5.4s, v13.s[2] + OP_ri v25.4s, v4.4s, v13.s[2] + OP_ir v25.4s, v5.4s, v12.s[2] prfm PLDL1KEEP, [pB, #512] - OP_rr v28.4s, v4.4s, v12.4s[3] - OP_ii v28.4s, v5.4s, v13.4s[3] - OP_ri v29.4s, v4.4s, v13.4s[3] - OP_ir v29.4s, v5.4s, v12.4s[3] + OP_rr v28.4s, v4.4s, v12.s[3] + OP_ii v28.4s, v5.4s, v13.s[3] + OP_ri v29.4s, v4.4s, v13.s[3] + OP_ir v29.4s, v5.4s, v12.s[3] .endm .macro KERNEL4x4_E - OP_rr v16.4s, v4.4s, v12.4s[0] - OP_ii v16.4s, v5.4s, v13.4s[0] - OP_ri v17.4s, v4.4s, v13.4s[0] - OP_ir v17.4s, v5.4s, v12.4s[0] - - OP_rr v20.4s, v4.4s, v12.4s[1] - OP_ii v20.4s, v5.4s, v13.4s[1] - OP_ri v21.4s, v4.4s, v13.4s[1] - OP_ir v21.4s, v5.4s, v12.4s[1] - - OP_rr v24.4s, v4.4s, v12.4s[2] - OP_ii v24.4s, v5.4s, v13.4s[2] - OP_ri v25.4s, v4.4s, v13.4s[2] - OP_ir v25.4s, v5.4s, v12.4s[2] - - OP_rr v28.4s, v4.4s, v12.4s[3] - OP_ii v28.4s, v5.4s, v13.4s[3] - OP_ri v29.4s, v4.4s, v13.4s[3] - OP_ir v29.4s, v5.4s, v12.4s[3] + OP_rr v16.4s, v4.4s, v12.s[0] + OP_ii v16.4s, v5.4s, v13.s[0] + OP_ri v17.4s, v4.4s, v13.s[0] + OP_ir v17.4s, v5.4s, v12.s[0] + + OP_rr v20.4s, v4.4s, v12.s[1] + OP_ii v20.4s, v5.4s, v13.s[1] + OP_ri v21.4s, v4.4s, v13.s[1] + OP_ir v21.4s, v5.4s, v12.s[1] + + OP_rr v24.4s, v4.4s, v12.s[2] + OP_ii v24.4s, v5.4s, v13.s[2] + OP_ri v25.4s, v4.4s, v13.s[2] + OP_ir v25.4s, v5.4s, v12.s[2] + + OP_rr v28.4s, v4.4s, v12.s[3] + OP_ii v28.4s, v5.4s, v13.s[3] + OP_ri v29.4s, v4.4s, v13.s[3] + OP_ir v29.4s, v5.4s, v12.s[3] .endm .macro KERNEL4x4_SUB @@ -700,25 +700,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld2 {v0.4s, v1.4s}, [pA] add pA, pA, #32 - OP_rr v16.4s, v0.4s, v8.4s[0] - OP_ii v16.4s, v1.4s, v9.4s[0] - OP_ri v17.4s, v0.4s, v9.4s[0] - OP_ir v17.4s, v1.4s, v8.4s[0] - - OP_rr v20.4s, v0.4s, v8.4s[1] - OP_ii v20.4s, v1.4s, v9.4s[1] - OP_ri v21.4s, v0.4s, v9.4s[1] - OP_ir v21.4s, v1.4s, v8.4s[1] - - OP_rr v24.4s, v0.4s, v8.4s[2] - OP_ii v24.4s, v1.4s, v9.4s[2] - OP_ri v25.4s, v0.4s, v9.4s[2] - OP_ir v25.4s, v1.4s, v8.4s[2] - - OP_rr v28.4s, v0.4s, v8.4s[3] - OP_ii v28.4s, v1.4s, v9.4s[3] - OP_ri v29.4s, v0.4s, v9.4s[3] - OP_ir v29.4s, v1.4s, v8.4s[3] + OP_rr v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v9.s[0] + OP_ri v17.4s, v0.4s, v9.s[0] + OP_ir v17.4s, v1.4s, v8.s[0] + + OP_rr v20.4s, v0.4s, v8.s[1] + OP_ii v20.4s, v1.4s, v9.s[1] + OP_ri v21.4s, v0.4s, v9.s[1] + OP_ir v21.4s, v1.4s, v8.s[1] + + OP_rr v24.4s, v0.4s, v8.s[2] + OP_ii v24.4s, v1.4s, v9.s[2] + OP_ri v25.4s, v0.4s, v9.s[2] + OP_ir v25.4s, v1.4s, v8.s[2] + + OP_rr v28.4s, v0.4s, v8.s[3] + OP_ii v28.4s, v1.4s, v9.s[3] + OP_ri v29.4s, v0.4s, v9.s[3] + OP_ir v29.4s, v1.4s, v8.s[3] .endm .macro SAVE4x4 @@ -780,25 +780,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld2 {v0.2s, v1.2s}, [pA] add pA, pA, #16 - OP_rr v16.2s, v0.2s, v8.4s[0] - OP_ii v16.2s, v1.2s, v9.4s[0] - OP_ri v17.2s, v0.2s, v9.4s[0] - OP_ir v17.2s, v1.2s, v8.4s[0] - - OP_rr v20.2s, v0.2s, v8.4s[1] - OP_ii v20.2s, v1.2s, v9.4s[1] - OP_ri v21.2s, v0.2s, v9.4s[1] - OP_ir v21.2s, v1.2s, v8.4s[1] - - OP_rr v24.2s, v0.2s, v8.4s[2] - OP_ii v24.2s, v1.2s, v9.4s[2] - OP_ri v25.2s, v0.2s, v9.4s[2] - OP_ir v25.2s, v1.2s, v8.4s[2] - - OP_rr v28.2s, v0.2s, v8.4s[3] - OP_ii v28.2s, v1.2s, v9.4s[3] - OP_ri v29.2s, v0.2s, v9.4s[3] - OP_ir v29.2s, v1.2s, v8.4s[3] + OP_rr v16.2s, v0.2s, v8.s[0] + OP_ii v16.2s, v1.2s, v9.s[0] + OP_ri v17.2s, v0.2s, v9.s[0] + OP_ir v17.2s, v1.2s, v8.s[0] + + OP_rr v20.2s, v0.2s, v8.s[1] + OP_ii v20.2s, v1.2s, v9.s[1] + OP_ri v21.2s, v0.2s, v9.s[1] + OP_ir v21.2s, v1.2s, v8.s[1] + + OP_rr v24.2s, v0.2s, v8.s[2] + OP_ii v24.2s, v1.2s, v9.s[2] + OP_ri v25.2s, v0.2s, v9.s[2] + OP_ir v25.2s, v1.2s, v8.s[2] + + OP_rr v28.2s, v0.2s, v8.s[3] + OP_ii v28.2s, v1.2s, v9.s[3] + OP_ri v29.2s, v0.2s, v9.s[3] + OP_ir v29.2s, v1.2s, v8.s[3] .endm .macro SAVE2x4 @@ -860,25 +860,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld2 {v0.s, v1.s}[0], [pA] add pA, pA, #8 - OP_rr s16, s0, v8.4s[0] - OP_ii s16, s1, v9.4s[0] - OP_ri s17, s0, v9.4s[0] - OP_ir s17, s1, v8.4s[0] - - OP_rr s20, s0, v8.4s[1] - OP_ii s20, s1, v9.4s[1] - OP_ri s21, s0, v9.4s[1] - OP_ir s21, s1, v8.4s[1] - - OP_rr s24, s0, v8.4s[2] - OP_ii s24, s1, v9.4s[2] - OP_ri s25, s0, v9.4s[2] - OP_ir s25, s1, v8.4s[2] - - OP_rr s28, s0, v8.4s[3] - OP_ii s28, s1, v9.4s[3] - OP_ri s29, s0, v9.4s[3] - OP_ir s29, s1, v8.4s[3] + OP_rr s16, s0, v8.s[0] + OP_ii s16, s1, v9.s[0] + OP_ri s17, s0, v9.s[0] + OP_ir s17, s1, v8.s[0] + + OP_rr s20, s0, v8.s[1] + OP_ii s20, s1, v9.s[1] + OP_ri s21, s0, v9.s[1] + OP_ir s21, s1, v8.s[1] + + OP_rr s24, s0, v8.s[2] + OP_ii s24, s1, v9.s[2] + OP_ri s25, s0, v9.s[2] + OP_ir s25, s1, v8.s[2] + + OP_rr s28, s0, v8.s[3] + OP_ii s28, s1, v9.s[3] + OP_ri s29, s0, v9.s[3] + OP_ir s29, s1, v8.s[3] .endm .macro SAVE1x4 @@ -942,25 +942,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v2.4s, v3.4s}, [pA] add pA, pA, #32 - OP_rr v16.4s, v0.4s, v8.2s[0] - OP_ii v16.4s, v1.4s, v9.2s[0] - OP_ri v17.4s, v0.4s, v9.2s[0] - OP_ir v17.4s, v1.4s, v8.2s[0] - - OP_rr v18.4s, v2.4s, v8.2s[0] - OP_ii v18.4s, v3.4s, v9.2s[0] - OP_ri v19.4s, v2.4s, v9.2s[0] - OP_ir v19.4s, v3.4s, v8.2s[0] - - OP_rr v20.4s, v0.4s, v8.2s[1] - OP_ii v20.4s, v1.4s, v9.2s[1] - OP_ri v21.4s, v0.4s, v9.2s[1] - OP_ir v21.4s, v1.4s, v8.2s[1] - - OP_rr v22.4s, v2.4s, v8.2s[1] - OP_ii v22.4s, v3.4s, v9.2s[1] - OP_ri v23.4s, v2.4s, v9.2s[1] - OP_ir v23.4s, v3.4s, v8.2s[1] + OP_rr v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v9.s[0] + OP_ri v17.4s, v0.4s, v9.s[0] + OP_ir v17.4s, v1.4s, v8.s[0] + + OP_rr v18.4s, v2.4s, v8.s[0] + OP_ii v18.4s, v3.4s, v9.s[0] + OP_ri v19.4s, v2.4s, v9.s[0] + OP_ir v19.4s, v3.4s, v8.s[0] + + OP_rr v20.4s, v0.4s, v8.s[1] + OP_ii v20.4s, v1.4s, v9.s[1] + OP_ri v21.4s, v0.4s, v9.s[1] + OP_ir v21.4s, v1.4s, v8.s[1] + + OP_rr v22.4s, v2.4s, v8.s[1] + OP_ii v22.4s, v3.4s, v9.s[1] + OP_ri v23.4s, 
v2.4s, v9.s[1] + OP_ir v23.4s, v3.4s, v8.s[1] .endm .macro SAVE8x2 @@ -1018,15 +1018,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.4s, v1.4s}, [pA] add pA, pA, #32 - OP_rr v16.4s, v0.4s, v8.2s[0] - OP_ii v16.4s, v1.4s, v9.2s[0] - OP_ri v17.4s, v0.4s, v9.2s[0] - OP_ir v17.4s, v1.4s, v8.2s[0] + OP_rr v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v9.s[0] + OP_ri v17.4s, v0.4s, v9.s[0] + OP_ir v17.4s, v1.4s, v8.s[0] - OP_rr v20.4s, v0.4s, v8.2s[1] - OP_ii v20.4s, v1.4s, v9.2s[1] - OP_ri v21.4s, v0.4s, v9.2s[1] - OP_ir v21.4s, v1.4s, v8.2s[1] + OP_rr v20.4s, v0.4s, v8.s[1] + OP_ii v20.4s, v1.4s, v9.s[1] + OP_ri v21.4s, v0.4s, v9.s[1] + OP_ir v21.4s, v1.4s, v8.s[1] .endm .macro SAVE4x2 @@ -1066,15 +1066,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.2s, v1.2s}, [pA] add pA, pA, #16 - OP_rr v16.2s, v0.2s, v8.2s[0] - OP_ii v16.2s, v1.2s, v9.2s[0] - OP_ri v17.2s, v0.2s, v9.2s[0] - OP_ir v17.2s, v1.2s, v8.2s[0] + OP_rr v16.2s, v0.2s, v8.s[0] + OP_ii v16.2s, v1.2s, v9.s[0] + OP_ri v17.2s, v0.2s, v9.s[0] + OP_ir v17.2s, v1.2s, v8.s[0] - OP_rr v20.2s, v0.2s, v8.2s[1] - OP_ii v20.2s, v1.2s, v9.2s[1] - OP_ri v21.2s, v0.2s, v9.2s[1] - OP_ir v21.2s, v1.2s, v8.2s[1] + OP_rr v20.2s, v0.2s, v8.s[1] + OP_ii v20.2s, v1.2s, v9.s[1] + OP_ri v21.2s, v0.2s, v9.s[1] + OP_ir v21.2s, v1.2s, v8.s[1] .endm .macro SAVE2x2 @@ -1114,15 +1114,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld2 {v0.s, v1.s}[0], [pA] add pA, pA, #8 - OP_rr s16, s0, v8.2s[0] - OP_ii s16, s1, v9.2s[0] - OP_ri s17, s0, v9.2s[0] - OP_ir s17, s1, v8.2s[0] + OP_rr s16, s0, v8.s[0] + OP_ii s16, s1, v9.s[0] + OP_ri s17, s0, v9.s[0] + OP_ir s17, s1, v8.s[0] - OP_rr s20, s0, v8.2s[1] - OP_ii s20, s1, v9.2s[1] - OP_ri s21, s0, v9.2s[1] - OP_ir s21, s1, v8.2s[1] + OP_rr s20, s0, v8.s[1] + OP_ii s20, s1, v9.s[1] + OP_ri s21, s0, v9.s[1] + OP_ir s21, s1, v8.s[1] .endm .macro SAVE1x2 @@ -1164,15 +1164,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v2.4s, v3.4s}, [pA] add pA, pA, #32 - OP_rr v16.4s, v0.4s, v8.4s[0] - OP_ii v16.4s, v1.4s, v8.4s[1] - OP_ri v17.4s, v0.4s, v8.4s[1] - OP_ir v17.4s, v1.4s, v8.4s[0] + OP_rr v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v8.s[1] + OP_ri v17.4s, v0.4s, v8.s[1] + OP_ir v17.4s, v1.4s, v8.s[0] - OP_rr v18.4s, v2.4s, v8.4s[0] - OP_ii v18.4s, v3.4s, v8.4s[1] - OP_ri v19.4s, v2.4s, v8.4s[1] - OP_ir v19.4s, v3.4s, v8.4s[0] + OP_rr v18.4s, v2.4s, v8.s[0] + OP_ii v18.4s, v3.4s, v8.s[1] + OP_ri v19.4s, v2.4s, v8.s[1] + OP_ir v19.4s, v3.4s, v8.s[0] .endm .macro SAVE8x1 diff --git a/kernel/arm64/dgemm_kernel_4x4.S b/kernel/arm64/dgemm_kernel_4x4.S index e2ad11492..44b0f7ff2 100644 --- a/kernel/arm64/dgemm_kernel_4x4.S +++ b/kernel/arm64/dgemm_kernel_4x4.S @@ -161,150 +161,150 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ldp q0, q1, [pA] add pA, pA, #32 - fmul v16.2d, v0.2d, v8.2d[0] - fmul v29.2d, v1.2d, v11.2d[0] + fmul v16.2d, v0.2d, v8.d[0] + fmul v29.2d, v1.2d, v11.d[0] ldp q2, q3, [ppA] add ppA, ppA, #32 - fmul v20.2d, v0.2d, v9.2d[0] - fmul v25.2d, v1.2d, v10.2d[0] + fmul v20.2d, v0.2d, v9.d[0] + fmul v25.2d, v1.2d, v10.d[0] prfm PLDL1KEEP, [pA, #A_PRE_SIZE] - fmul v18.2d, v2.2d, v8.2d[0] - fmul v31.2d, v3.2d, v11.2d[0] + fmul v18.2d, v2.2d, v8.d[0] + fmul v31.2d, v3.2d, v11.d[0] prfm PLDL1KEEP, [ppA, #A_PRE_SIZE] - fmul v22.2d, v2.2d, v9.2d[0] - fmul v27.2d, v3.2d, v10.2d[0] + fmul v22.2d, v2.2d, v9.d[0] + fmul v27.2d, v3.2d, v10.d[0] ldp d12, d13, [pB] add pB, pB, #16 - fmul v24.2d, v0.2d, v10.2d[0] - fmul v21.2d, v1.2d, v9.2d[0] + fmul v24.2d, v0.2d, v10.d[0] + fmul v21.2d, v1.2d, v9.d[0] ldp q4, q5, [pA] // for next round add pA, pA, #32 - fmul v26.2d, v2.2d, v10.2d[0] - fmul v23.2d, v3.2d, v9.2d[0] + fmul v26.2d, v2.2d, v10.d[0] + fmul v23.2d, v3.2d, v9.d[0] ldp q6, q7, [ppA] // for next round add ppA, ppA, #32 - fmul v28.2d, v0.2d, v11.2d[0] - fmul v17.2d, v1.2d, v8.2d[0] + fmul v28.2d, v0.2d, v11.d[0] + fmul v17.2d, v1.2d, v8.d[0] ldp d14, d15, [pB] add pB, pB, #16 - fmul v30.2d, v2.2d, v11.2d[0] - fmul v19.2d, v3.2d, v8.2d[0] + fmul v30.2d, v2.2d, v11.d[0] + fmul v19.2d, v3.2d, v8.d[0] .endm .macro KERNEL8x4_M2 - fmla v16.2d, v4.2d, v12.2d[0] - fmla v29.2d, v5.2d, v15.2d[0] + fmla v16.2d, v4.2d, v12.d[0] + fmla v29.2d, v5.2d, v15.d[0] ldp d8, d9, [pB] add pB, pB, #16 - fmla v18.2d, v6.2d, v12.2d[0] - fmla v31.2d, v7.2d, v15.2d[0] + fmla v18.2d, v6.2d, v12.d[0] + fmla v31.2d, v7.2d, v15.d[0] ldp d10, d11, [pB] add pB, pB, #16 - fmla v20.2d, v4.2d, v13.2d[0] - fmla v25.2d, v5.2d, v14.2d[0] + fmla v20.2d, v4.2d, v13.d[0] + fmla v25.2d, v5.2d, v14.d[0] prfm PLDL1KEEP, [pB, #B_PRE_SIZE] - fmla v22.2d, v6.2d, v13.2d[0] - fmla v27.2d, v7.2d, v14.2d[0] - fmla v24.2d, v4.2d, v14.2d[0] - fmla v21.2d, v5.2d, v13.2d[0] + fmla v22.2d, v6.2d, v13.d[0] + fmla v27.2d, v7.2d, 
v14.d[0] + fmla v24.2d, v4.2d, v14.d[0] + fmla v21.2d, v5.2d, v13.d[0] ldp q0, q1, [pA] add pA, pA, #32 - fmla v26.2d, v6.2d, v14.2d[0] - fmla v23.2d, v7.2d, v13.2d[0] - fmla v28.2d, v4.2d, v15.2d[0] - fmla v17.2d, v5.2d, v12.2d[0] + fmla v26.2d, v6.2d, v14.d[0] + fmla v23.2d, v7.2d, v13.d[0] + fmla v28.2d, v4.2d, v15.d[0] + fmla v17.2d, v5.2d, v12.d[0] ldp q2, q3, [ppA] add ppA, ppA, #32 - fmla v30.2d, v6.2d, v15.2d[0] - fmla v19.2d, v7.2d, v12.2d[0] + fmla v30.2d, v6.2d, v15.d[0] + fmla v19.2d, v7.2d, v12.d[0] .endm .macro KERNEL8x4_M1 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v29.2d, v1.2d, v11.2d[0] + fmla v16.2d, v0.2d, v8.d[0] + fmla v29.2d, v1.2d, v11.d[0] ldp d12, d13, [pB] add pB, pB, #16 - fmla v18.2d, v2.2d, v8.2d[0] - fmla v31.2d, v3.2d, v11.2d[0] + fmla v18.2d, v2.2d, v8.d[0] + fmla v31.2d, v3.2d, v11.d[0] ldp d14, d15, [pB] add pB, pB, #16 - fmla v20.2d, v0.2d, v9.2d[0] - fmla v25.2d, v1.2d, v10.2d[0] + fmla v20.2d, v0.2d, v9.d[0] + fmla v25.2d, v1.2d, v10.d[0] prfm PLDL1KEEP, [pA, #A_PRE_SIZE] - fmla v22.2d, v2.2d, v9.2d[0] - fmla v27.2d, v3.2d, v10.2d[0] + fmla v22.2d, v2.2d, v9.d[0] + fmla v27.2d, v3.2d, v10.d[0] prfm PLDL1KEEP, [ppA, #A_PRE_SIZE] - fmla v24.2d, v0.2d, v10.2d[0] - fmla v21.2d, v1.2d, v9.2d[0] + fmla v24.2d, v0.2d, v10.d[0] + fmla v21.2d, v1.2d, v9.d[0] ldp q4, q5, [pA] add pA, pA, #32 - fmla v26.2d, v2.2d, v10.2d[0] - fmla v23.2d, v3.2d, v9.2d[0] + fmla v26.2d, v2.2d, v10.d[0] + fmla v23.2d, v3.2d, v9.d[0] - fmla v28.2d, v0.2d, v11.2d[0] - fmla v17.2d, v1.2d, v8.2d[0] + fmla v28.2d, v0.2d, v11.d[0] + fmla v17.2d, v1.2d, v8.d[0] ldp q6, q7, [ppA] add ppA, ppA, #32 - fmla v30.2d, v2.2d, v11.2d[0] - fmla v19.2d, v3.2d, v8.2d[0] + fmla v30.2d, v2.2d, v11.d[0] + fmla v19.2d, v3.2d, v8.d[0] .endm .macro KERNEL8x4_E - fmla v16.2d, v4.2d, v12.2d[0] - fmla v25.2d, v5.2d, v14.2d[0] - fmla v18.2d, v6.2d, v12.2d[0] - fmla v27.2d, v7.2d, v14.2d[0] - - fmla v20.2d, v4.2d, v13.2d[0] - fmla v29.2d, v5.2d, v15.2d[0] - fmla v22.2d, v6.2d, v13.2d[0] - 
fmla v31.2d, v7.2d, v15.2d[0] - - fmla v24.2d, v4.2d, v14.2d[0] - fmla v17.2d, v5.2d, v12.2d[0] - fmla v26.2d, v6.2d, v14.2d[0] - fmla v19.2d, v7.2d, v12.2d[0] - - fmla v28.2d, v4.2d, v15.2d[0] - fmla v21.2d, v5.2d, v13.2d[0] - fmla v30.2d, v6.2d, v15.2d[0] - fmla v23.2d, v7.2d, v13.2d[0] + fmla v16.2d, v4.2d, v12.d[0] + fmla v25.2d, v5.2d, v14.d[0] + fmla v18.2d, v6.2d, v12.d[0] + fmla v27.2d, v7.2d, v14.d[0] + + fmla v20.2d, v4.2d, v13.d[0] + fmla v29.2d, v5.2d, v15.d[0] + fmla v22.2d, v6.2d, v13.d[0] + fmla v31.2d, v7.2d, v15.d[0] + + fmla v24.2d, v4.2d, v14.d[0] + fmla v17.2d, v5.2d, v12.d[0] + fmla v26.2d, v6.2d, v14.d[0] + fmla v19.2d, v7.2d, v12.d[0] + + fmla v28.2d, v4.2d, v15.d[0] + fmla v21.2d, v5.2d, v13.d[0] + fmla v30.2d, v6.2d, v15.d[0] + fmla v23.2d, v7.2d, v13.d[0] .endm .macro KERNEL8x4_SUB @@ -315,28 +315,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ldp q0, q1, [pA] add pA, pA, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v29.2d, v1.2d, v11.2d[0] - fmla v20.2d, v0.2d, v9.2d[0] - fmla v25.2d, v1.2d, v10.2d[0] + fmla v16.2d, v0.2d, v8.d[0] + fmla v29.2d, v1.2d, v11.d[0] + fmla v20.2d, v0.2d, v9.d[0] + fmla v25.2d, v1.2d, v10.d[0] ldp q2, q3, [ppA] add ppA, ppA, #32 - fmla v24.2d, v0.2d, v10.2d[0] - fmla v21.2d, v1.2d, v9.2d[0] - fmla v28.2d, v0.2d, v11.2d[0] - fmla v17.2d, v1.2d, v8.2d[0] + fmla v24.2d, v0.2d, v10.d[0] + fmla v21.2d, v1.2d, v9.d[0] + fmla v28.2d, v0.2d, v11.d[0] + fmla v17.2d, v1.2d, v8.d[0] - fmla v18.2d, v2.2d, v8.2d[0] - fmla v31.2d, v3.2d, v11.2d[0] - fmla v22.2d, v2.2d, v9.2d[0] - fmla v27.2d, v3.2d, v10.2d[0] + fmla v18.2d, v2.2d, v8.d[0] + fmla v31.2d, v3.2d, v11.d[0] + fmla v22.2d, v2.2d, v9.d[0] + fmla v27.2d, v3.2d, v10.d[0] - fmla v26.2d, v2.2d, v10.2d[0] - fmla v23.2d, v3.2d, v9.2d[0] - fmla v30.2d, v2.2d, v11.2d[0] - fmla v19.2d, v3.2d, v8.2d[0] + fmla v26.2d, v2.2d, v10.d[0] + fmla v23.2d, v3.2d, v9.d[0] + fmla v30.2d, v2.2d, v11.d[0] + fmla v19.2d, v3.2d, v8.d[0] .endm .macro SAVE8x4 @@ 
-422,17 +422,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d, v1.2d}, [pA] add pA, pA, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v29.2d, v1.2d, v9.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v29.2d, v1.2d, v9.d[1] - fmla v20.2d, v0.2d, v8.2d[1] - fmla v25.2d, v1.2d, v9.2d[0] + fmla v20.2d, v0.2d, v8.d[1] + fmla v25.2d, v1.2d, v9.d[0] - fmla v24.2d, v0.2d, v9.2d[0] - fmla v21.2d, v1.2d, v8.2d[1] + fmla v24.2d, v0.2d, v9.d[0] + fmla v21.2d, v1.2d, v8.d[1] - fmla v28.2d, v0.2d, v9.2d[1] - fmla v17.2d, v1.2d, v8.2d[0] + fmla v28.2d, v0.2d, v9.d[1] + fmla v17.2d, v1.2d, v8.d[0] .endm .macro SAVE4x4 @@ -482,10 +482,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d}, [pA] add pA, pA, #16 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v20.2d, v0.2d, v8.2d[1] - fmla v24.2d, v0.2d, v9.2d[0] - fmla v28.2d, v0.2d, v9.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v20.2d, v0.2d, v8.d[1] + fmla v24.2d, v0.2d, v9.d[0] + fmla v28.2d, v0.2d, v9.d[1] .endm .macro SAVE2x4 @@ -572,10 +572,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d, v1.2d}, [pA] add pA, pA, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v17.2d, v1.2d, v8.2d[0] - fmla v20.2d, v0.2d, v8.2d[1] - fmla v21.2d, v1.2d, v8.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v17.2d, v1.2d, v8.d[0] + fmla v20.2d, v0.2d, v8.d[1] + fmla v21.2d, v1.2d, v8.d[1] .endm .macro SAVE4x2 @@ -610,8 +610,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d}, [pA] add pA, pA, #16 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v20.2d, v0.2d, v8.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v20.2d, v0.2d, v8.d[1] .endm .macro SAVE2x2 @@ -643,7 +643,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ldr d0 , [pA] add pA, pA, #8 - fmla v16.2d, v8.2d, v0.2d[0] + fmla v16.2d, v8.2d, v0.d[0] .endm .macro SAVE1x2 @@ -674,8 +674,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d, v1.2d}, [pA] add pA , pA, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v17.2d, v1.2d, v8.2d[0] + fmla v16.2d, v0.2d, v8.d[0] + fmla v17.2d, v1.2d, v8.d[0] .endm .macro SAVE4x1 @@ -705,7 +705,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d}, [pA] add pA , pA, #16 - fmla v16.2d, v0.2d, v8.2d[0] + fmla v16.2d, v0.2d, v8.d[0] .endm .macro SAVE2x1 diff --git a/kernel/arm64/dgemm_kernel_4x8.S b/kernel/arm64/dgemm_kernel_4x8.S old mode 100755 new mode 100644 index 88e9a773d..b04dbb5d5 --- a/kernel/arm64/dgemm_kernel_4x8.S +++ b/kernel/arm64/dgemm_kernel_4x8.S @@ -154,25 +154,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v10.2d, v11.2d}, [pB] add pB, pB, #32 - fmul v16.2d, v0.2d, v8.2d[0] - fmul v17.2d, v1.2d, v8.2d[0] - fmul v18.2d, v0.2d, v8.2d[1] - fmul v19.2d, v1.2d, v8.2d[1] - - fmul v20.2d, v0.2d, v9.2d[0] - fmul v21.2d, v1.2d, v9.2d[0] - fmul v22.2d, v0.2d, v9.2d[1] - fmul v23.2d, v1.2d, v9.2d[1] - - fmul v24.2d, v0.2d, v10.2d[0] - fmul v25.2d, v1.2d, v10.2d[0] - fmul v26.2d, v0.2d, v10.2d[1] - fmul v27.2d, v1.2d, v10.2d[1] - - fmul v28.2d, v0.2d, v11.2d[0] - fmul v29.2d, v1.2d, v11.2d[0] - fmul v30.2d, v0.2d, v11.2d[1] - fmul v31.2d, v1.2d, v11.2d[1] + fmul v16.2d, v0.2d, v8.d[0] + fmul v17.2d, v1.2d, v8.d[0] + fmul v18.2d, v0.2d, v8.d[1] + fmul v19.2d, v1.2d, v8.d[1] + + fmul v20.2d, v0.2d, v9.d[0] + fmul v21.2d, v1.2d, v9.d[0] + fmul v22.2d, v0.2d, v9.d[1] + fmul v23.2d, v1.2d, v9.d[1] + + fmul v24.2d, v0.2d, v10.d[0] + fmul v25.2d, v1.2d, v10.d[0] + fmul v26.2d, v0.2d, v10.d[1] + fmul v27.2d, v1.2d, v10.d[1] + + fmul v28.2d, v0.2d, v11.d[0] + fmul v29.2d, v1.2d, v11.d[0] + fmul v30.2d, v0.2d, v11.d[1] + fmul v31.2d, v1.2d, v11.d[1] ld1 {v12.2d, v13.2d}, [pB] add pB, pB, #32 
@@ -183,25 +183,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL4x8_M1 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v17.2d, v1.2d, v8.2d[0] - fmla v18.2d, v0.2d, v8.2d[1] - fmla v19.2d, v1.2d, v8.2d[1] - - fmla v20.2d, v0.2d, v9.2d[0] - fmla v21.2d, v1.2d, v9.2d[0] - fmla v22.2d, v0.2d, v9.2d[1] - fmla v23.2d, v1.2d, v9.2d[1] - - fmla v24.2d, v0.2d, v10.2d[0] - fmla v25.2d, v1.2d, v10.2d[0] - fmla v26.2d, v0.2d, v10.2d[1] - fmla v27.2d, v1.2d, v10.2d[1] - - fmla v28.2d, v0.2d, v11.2d[0] - fmla v29.2d, v1.2d, v11.2d[0] - fmla v30.2d, v0.2d, v11.2d[1] - fmla v31.2d, v1.2d, v11.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v17.2d, v1.2d, v8.d[0] + fmla v18.2d, v0.2d, v8.d[1] + fmla v19.2d, v1.2d, v8.d[1] + + fmla v20.2d, v0.2d, v9.d[0] + fmla v21.2d, v1.2d, v9.d[0] + fmla v22.2d, v0.2d, v9.d[1] + fmla v23.2d, v1.2d, v9.d[1] + + fmla v24.2d, v0.2d, v10.d[0] + fmla v25.2d, v1.2d, v10.d[0] + fmla v26.2d, v0.2d, v10.d[1] + fmla v27.2d, v1.2d, v10.d[1] + + fmla v28.2d, v0.2d, v11.d[0] + fmla v29.2d, v1.2d, v11.d[0] + fmla v30.2d, v0.2d, v11.d[1] + fmla v31.2d, v1.2d, v11.d[1] ld1 {v12.2d, v13.2d}, [pB] // For next round add pB, pB, #32 @@ -214,25 +214,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL4x8_M2 - fmla v16.2d, v4.2d, v12.2d[0] - fmla v17.2d, v5.2d, v12.2d[0] - fmla v18.2d, v4.2d, v12.2d[1] - fmla v19.2d, v5.2d, v12.2d[1] - - fmla v20.2d, v4.2d, v13.2d[0] - fmla v21.2d, v5.2d, v13.2d[0] - fmla v22.2d, v4.2d, v13.2d[1] - fmla v23.2d, v5.2d, v13.2d[1] - - fmla v24.2d, v4.2d, v14.2d[0] - fmla v25.2d, v5.2d, v14.2d[0] - fmla v26.2d, v4.2d, v14.2d[1] - fmla v27.2d, v5.2d, v14.2d[1] - - fmla v28.2d, v4.2d, v15.2d[0] - fmla v29.2d, v5.2d, v15.2d[0] - fmla v30.2d, v4.2d, v15.2d[1] - fmla v31.2d, v5.2d, v15.2d[1] + fmla v16.2d, v4.2d, v12.d[0] + fmla v17.2d, v5.2d, v12.d[0] + fmla v18.2d, v4.2d, v12.d[1] + fmla v19.2d, v5.2d, v12.d[1] + + fmla v20.2d, v4.2d, v13.d[0] + fmla v21.2d, v5.2d, v13.d[0] + fmla v22.2d, v4.2d, v13.d[1] + fmla v23.2d, v5.2d, v13.d[1] + + fmla v24.2d, v4.2d, v14.d[0] + fmla v25.2d, v5.2d, v14.d[0] + fmla v26.2d, v4.2d, v14.d[1] + fmla v27.2d, v5.2d, v14.d[1] + + fmla v28.2d, v4.2d, v15.d[0] + fmla v29.2d, v5.2d, v15.d[0] + fmla v30.2d, v4.2d, v15.d[1] + fmla v31.2d, v5.2d, v15.d[1] ld1 {v8.2d, v9.2d}, [pB] // For next round add pB, pB, #32 @@ -245,25 +245,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL4x8_E - fmla v16.2d, v4.2d, v12.2d[0] - fmla v17.2d, v5.2d, v12.2d[0] - fmla v18.2d, v4.2d, v12.2d[1] - fmla v19.2d, v5.2d, v12.2d[1] - - fmla v20.2d, v4.2d, v13.2d[0] - fmla v21.2d, v5.2d, v13.2d[0] - fmla v22.2d, v4.2d, v13.2d[1] - fmla v23.2d, v5.2d, v13.2d[1] - - fmla v24.2d, v4.2d, v14.2d[0] - fmla v25.2d, v5.2d, v14.2d[0] - fmla v26.2d, v4.2d, v14.2d[1] - fmla v27.2d, v5.2d, v14.2d[1] - - fmla v28.2d, v4.2d, v15.2d[0] - fmla v29.2d, v5.2d, v15.2d[0] - fmla v30.2d, v4.2d, v15.2d[1] - fmla v31.2d, v5.2d, v15.2d[1] + fmla v16.2d, v4.2d, v12.d[0] + fmla v17.2d, v5.2d, v12.d[0] + fmla v18.2d, v4.2d, v12.d[1] + fmla v19.2d, v5.2d, v12.d[1] + + fmla v20.2d, v4.2d, v13.d[0] + fmla v21.2d, v5.2d, v13.d[0] + fmla v22.2d, v4.2d, v13.d[1] + fmla v23.2d, v5.2d, v13.d[1] + + fmla v24.2d, v4.2d, v14.d[0] + fmla v25.2d, v5.2d, v14.d[0] + fmla v26.2d, v4.2d, v14.d[1] + fmla v27.2d, v5.2d, v14.d[1] + + fmla v28.2d, v4.2d, v15.d[0] + fmla v29.2d, v5.2d, v15.d[0] + fmla v30.2d, v4.2d, v15.d[1] + fmla v31.2d, v5.2d, v15.d[1] .endm .macro KERNEL4x8_SUB @@ -274,25 +274,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld1 {v10.2d, v11.2d}, [pB] add pB, pB, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v17.2d, v1.2d, v8.2d[0] - fmla v18.2d, v0.2d, v8.2d[1] - fmla v19.2d, v1.2d, v8.2d[1] - - fmla v20.2d, v0.2d, v9.2d[0] - fmla v21.2d, v1.2d, v9.2d[0] - fmla v22.2d, v0.2d, v9.2d[1] - fmla v23.2d, v1.2d, v9.2d[1] - - fmla v24.2d, v0.2d, v10.2d[0] - fmla v25.2d, v1.2d, v10.2d[0] - fmla v26.2d, v0.2d, v10.2d[1] - fmla v27.2d, v1.2d, v10.2d[1] - - fmla v28.2d, v0.2d, v11.2d[0] - fmla v29.2d, v1.2d, v11.2d[0] - fmla v30.2d, v0.2d, v11.2d[1] - fmla v31.2d, v1.2d, v11.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v17.2d, v1.2d, v8.d[0] + fmla v18.2d, v0.2d, v8.d[1] + fmla v19.2d, v1.2d, v8.d[1] + + fmla v20.2d, v0.2d, v9.d[0] + fmla v21.2d, v1.2d, v9.d[0] + fmla v22.2d, v0.2d, v9.d[1] + fmla v23.2d, v1.2d, v9.d[1] + + fmla v24.2d, v0.2d, v10.d[0] + fmla v25.2d, v1.2d, v10.d[0] + fmla v26.2d, v0.2d, v10.d[1] + fmla v27.2d, v1.2d, v10.d[1] + + fmla v28.2d, v0.2d, v11.d[0] + fmla v29.2d, v1.2d, v11.d[0] + fmla v30.2d, v0.2d, v11.d[1] + fmla v31.2d, v1.2d, v11.d[1] .endm .macro SAVE4x8 @@ -374,17 +374,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v10.2d, v11.2d}, [pB] add pB, pB, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v18.2d, v0.2d, v8.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v18.2d, v0.2d, v8.d[1] - fmla v20.2d, v0.2d, v9.2d[0] - fmla v22.2d, v0.2d, v9.2d[1] + fmla v20.2d, v0.2d, v9.d[0] + fmla v22.2d, v0.2d, v9.d[1] - fmla v24.2d, v0.2d, v10.2d[0] - fmla v26.2d, v0.2d, v10.2d[1] + fmla v24.2d, v0.2d, v10.d[0] + fmla v26.2d, v0.2d, v10.d[1] - fmla v28.2d, v0.2d, v11.2d[0] - fmla v30.2d, v0.2d, v11.2d[1] + fmla v28.2d, v0.2d, v11.d[0] + fmla v30.2d, v0.2d, v11.d[1] .endm .macro SAVE2x8 @@ -520,17 +520,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld1 {v0.2d, v1.2d}, [pA] add pA, pA, #32 - fmul v16.2d, v0.2d, v8.2d[0] - fmul v29.2d, v1.2d, v9.2d[1] + fmul v16.2d, v0.2d, v8.d[0] + fmul v29.2d, v1.2d, v9.d[1] - fmul v20.2d, v0.2d, v8.2d[1] - fmul v25.2d, v1.2d, v9.2d[0] + fmul v20.2d, v0.2d, v8.d[1] + fmul v25.2d, v1.2d, v9.d[0] - fmul v24.2d, v0.2d, v9.2d[0] - fmul v21.2d, v1.2d, v8.2d[1] + fmul v24.2d, v0.2d, v9.d[0] + fmul v21.2d, v1.2d, v8.d[1] - fmul v28.2d, v0.2d, v9.2d[1] - fmul v17.2d, v1.2d, v8.2d[0] + fmul v28.2d, v0.2d, v9.d[1] + fmul v17.2d, v1.2d, v8.d[0] ld1 {v12.2d, v13.2d}, [pB] add pB, pB, #32 @@ -539,61 +539,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL4x4_M1 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v29.2d, v1.2d, v9.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v29.2d, v1.2d, v9.d[1] ld1 {v12.2d, v13.2d}, [pB] // For next round add pB, pB, #32 - fmla v20.2d, v0.2d, v8.2d[1] - fmla v25.2d, v1.2d, v9.2d[0] + fmla v20.2d, v0.2d, v8.d[1] + fmla v25.2d, v1.2d, v9.d[0] ld1 {v4.2d, v5.2d}, [pA] // For next round add pA, pA, #32 - fmla v24.2d, v0.2d, v9.2d[0] - fmla v21.2d, v1.2d, v8.2d[1] + fmla v24.2d, v0.2d, v9.d[0] + fmla v21.2d, v1.2d, v8.d[1] prfm PLDL1KEEP, [pA, #512] - fmla v28.2d, v0.2d, v9.2d[1] - fmla v17.2d, v1.2d, v8.2d[0] + fmla v28.2d, v0.2d, v9.d[1] + fmla v17.2d, v1.2d, v8.d[0] .endm .macro KERNEL4x4_M2 - fmla v16.2d, v4.2d, v12.2d[0] - fmla v29.2d, v5.2d, v13.2d[1] + fmla v16.2d, v4.2d, v12.d[0] + fmla v29.2d, v5.2d, v13.d[1] ld1 {v8.2d, v9.2d}, [pB] // For next round add pB, pB, #32 - fmla v20.2d, v4.2d, v12.2d[1] - fmla v25.2d, v5.2d, v13.2d[0] + fmla v20.2d, v4.2d, v12.d[1] + fmla v25.2d, v5.2d, v13.d[0] ld1 {v0.2d, v1.2d}, [pA] // For next round add pA, pA, #32 - fmla v24.2d, v4.2d, v13.2d[0] - fmla v21.2d, v5.2d, v12.2d[1] + fmla v24.2d, v4.2d, v13.d[0] + fmla v21.2d, v5.2d, v12.d[1] prfm PLDL1KEEP, [pB, #512] - fmla v28.2d, v4.2d, v13.2d[1] - fmla v17.2d, v5.2d, v12.2d[0] + fmla v28.2d, v4.2d, v13.d[1] + fmla v17.2d, v5.2d, 
v12.d[0] .endm .macro KERNEL4x4_E - fmla v16.2d, v4.2d, v12.2d[0] - fmla v29.2d, v5.2d, v13.2d[1] + fmla v16.2d, v4.2d, v12.d[0] + fmla v29.2d, v5.2d, v13.d[1] - fmla v20.2d, v4.2d, v12.2d[1] - fmla v25.2d, v5.2d, v13.2d[0] + fmla v20.2d, v4.2d, v12.d[1] + fmla v25.2d, v5.2d, v13.d[0] - fmla v24.2d, v4.2d, v13.2d[0] - fmla v21.2d, v5.2d, v12.2d[1] + fmla v24.2d, v4.2d, v13.d[0] + fmla v21.2d, v5.2d, v12.d[1] - fmla v28.2d, v4.2d, v13.2d[1] - fmla v17.2d, v5.2d, v12.2d[0] + fmla v28.2d, v4.2d, v13.d[1] + fmla v17.2d, v5.2d, v12.d[0] .endm .macro KERNEL4x4_SUB @@ -602,17 +602,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d, v1.2d}, [pA] add pA, pA, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v29.2d, v1.2d, v9.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v29.2d, v1.2d, v9.d[1] - fmla v20.2d, v0.2d, v8.2d[1] - fmla v25.2d, v1.2d, v9.2d[0] + fmla v20.2d, v0.2d, v8.d[1] + fmla v25.2d, v1.2d, v9.d[0] - fmla v24.2d, v0.2d, v9.2d[0] - fmla v21.2d, v1.2d, v8.2d[1] + fmla v24.2d, v0.2d, v9.d[0] + fmla v21.2d, v1.2d, v8.d[1] - fmla v28.2d, v0.2d, v9.2d[1] - fmla v17.2d, v1.2d, v8.2d[0] + fmla v28.2d, v0.2d, v9.d[1] + fmla v17.2d, v1.2d, v8.d[0] .endm .macro SAVE4x4 @@ -660,10 +660,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d}, [pA] add pA, pA, #16 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v20.2d, v0.2d, v8.2d[1] - fmla v24.2d, v0.2d, v9.2d[0] - fmla v28.2d, v0.2d, v9.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v20.2d, v0.2d, v8.d[1] + fmla v24.2d, v0.2d, v9.d[0] + fmla v28.2d, v0.2d, v9.d[1] .endm .macro SAVE2x4 @@ -746,10 +746,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld1 {v0.2d, v1.2d}, [pA] add pA, pA, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v17.2d, v1.2d, v8.2d[0] - fmla v20.2d, v0.2d, v8.2d[1] - fmla v21.2d, v1.2d, v8.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v17.2d, v1.2d, v8.d[0] + fmla v20.2d, v0.2d, v8.d[1] + fmla v21.2d, v1.2d, v8.d[1] .endm .macro SAVE4x2 @@ -782,8 +782,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d}, [pA] add pA, pA, #16 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v20.2d, v0.2d, v8.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v20.2d, v0.2d, v8.d[1] .endm .macro SAVE2x2 @@ -813,7 +813,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ldr d0 , [pA] add pA, pA, #8 - fmla v16.2d, v8.2d, v0.2d[0] + fmla v16.2d, v8.2d, v0.d[0] .endm .macro SAVE1x2 @@ -842,8 +842,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d, v1.2d}, [pA] add pA , pA, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v17.2d, v1.2d, v8.2d[0] + fmla v16.2d, v0.2d, v8.d[0] + fmla v17.2d, v1.2d, v8.d[0] .endm .macro SAVE4x1 @@ -871,7 +871,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d}, [pA] add pA , pA, #16 - fmla v16.2d, v0.2d, v8.2d[0] + fmla v16.2d, v0.2d, v8.d[0] .endm .macro SAVE2x1 diff --git a/kernel/arm64/dgemm_kernel_8x4.S b/kernel/arm64/dgemm_kernel_8x4.S old mode 100755 new mode 100644 index 33e076e6a..f3c3d5c35 --- a/kernel/arm64/dgemm_kernel_8x4.S +++ b/kernel/arm64/dgemm_kernel_8x4.S @@ -151,141 +151,141 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ldp d8, d9, [pB], #16 - fmul v16.2d, v0.2d, v8.2d[0] - fmul v20.2d, v0.2d, v9.2d[0] + fmul v16.2d, v0.2d, v8.d[0] + fmul v20.2d, v0.2d, v9.d[0] ldp d10, d11, [pB], #16 - fmul v17.2d, v1.2d, v8.2d[0] - fmul v21.2d, v1.2d, v9.2d[0] + fmul v17.2d, v1.2d, v8.d[0] + fmul v21.2d, v1.2d, v9.d[0] ldp q2, q3, [pA], #32 - fmul v24.2d, v0.2d, v10.2d[0] - fmul v28.2d, v0.2d, v11.2d[0] + fmul v24.2d, v0.2d, v10.d[0] + fmul v28.2d, v0.2d, v11.d[0] ldp q4, q5, [pA], #32 - fmul v25.2d, v1.2d, v10.2d[0] - fmul v29.2d, v1.2d, v11.2d[0] + fmul v25.2d, v1.2d, v10.d[0] + fmul v29.2d, v1.2d, v11.d[0] ldp d12, d13, [pB], #16 - fmul v18.2d, v2.2d, v8.2d[0] - fmul v22.2d, v2.2d, v9.2d[0] + fmul v18.2d, v2.2d, v8.d[0] + fmul v22.2d, v2.2d, v9.d[0] ldp d14, d15, [pB], #16 - fmul v26.2d, v2.2d, v10.2d[0] - fmul v30.2d, v2.2d, v11.2d[0] + fmul v26.2d, v2.2d, v10.d[0] + fmul v30.2d, v2.2d, v11.d[0] ldp q6, q7, [pA], #32 - fmul v19.2d, v3.2d, v8.2d[0] - fmul v27.2d, v3.2d, v10.2d[0] + fmul v19.2d, v3.2d, v8.d[0] + fmul v27.2d, v3.2d, v10.d[0] prfm PLDL1KEEP, [pA, #A_PRE_SIZE] - fmul v31.2d, v3.2d, v11.2d[0] - fmul v23.2d, v3.2d, v9.2d[0] + fmul v31.2d, v3.2d, v11.d[0] + fmul v23.2d, v3.2d, v9.d[0] prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] .endm .macro KERNEL8x4_M1 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v20.2d, v0.2d, v9.2d[0] + fmla v16.2d, v0.2d, v8.d[0] + fmla v20.2d, v0.2d, v9.d[0] ldp q4, q5, [pA], #32 - fmla v24.2d, v0.2d, v10.2d[0] - fmla v28.2d, v0.2d, v11.2d[0] + fmla v24.2d, v0.2d, v10.d[0] + fmla v28.2d, v0.2d, v11.d[0] ldp d12, d13, [pB], #16 - fmla v17.2d, v1.2d, v8.2d[0] - fmla v25.2d, v1.2d, v10.2d[0] + fmla v17.2d, v1.2d, v8.d[0] + fmla v25.2d, v1.2d, v10.d[0] prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] - fmla v21.2d, v1.2d, v9.2d[0] - fmla v29.2d, v1.2d, v11.2d[0] + fmla v21.2d, v1.2d, v9.d[0] + fmla v29.2d, v1.2d, v11.d[0] ldp d14, d15, [pB], #16 - fmla v18.2d, v2.2d, v8.2d[0] - fmla v22.2d, v2.2d, v9.2d[0] + fmla v18.2d, v2.2d, v8.d[0] + fmla v22.2d, v2.2d, v9.d[0] prfm PLDL1KEEP, [pA, 
#A_PRE_SIZE] - fmla v26.2d, v2.2d, v10.2d[0] - fmla v30.2d, v2.2d, v11.2d[0] - fmla v19.2d, v3.2d, v8.2d[0] - fmla v23.2d, v3.2d, v9.2d[0] + fmla v26.2d, v2.2d, v10.d[0] + fmla v30.2d, v2.2d, v11.d[0] + fmla v19.2d, v3.2d, v8.d[0] + fmla v23.2d, v3.2d, v9.d[0] ldp q6, q7, [pA], #32 - fmla v27.2d, v3.2d, v10.2d[0] - fmla v31.2d, v3.2d, v11.2d[0] + fmla v27.2d, v3.2d, v10.d[0] + fmla v31.2d, v3.2d, v11.d[0] .endm .macro KERNEL8x4_M2 - fmla v16.2d, v4.2d, v12.2d[0] - fmla v20.2d, v4.2d, v13.2d[0] - fmla v24.2d, v4.2d, v14.2d[0] - fmla v28.2d, v4.2d, v15.2d[0] + fmla v16.2d, v4.2d, v12.d[0] + fmla v20.2d, v4.2d, v13.d[0] + fmla v24.2d, v4.2d, v14.d[0] + fmla v28.2d, v4.2d, v15.d[0] ldp q0, q1, [pA], #32 - fmla v17.2d, v5.2d, v12.2d[0] - fmla v25.2d, v5.2d, v14.2d[0] + fmla v17.2d, v5.2d, v12.d[0] + fmla v25.2d, v5.2d, v14.d[0] ldp d8, d9, [pB], #16 - fmla v21.2d, v5.2d, v13.2d[0] - fmla v29.2d, v5.2d, v15.2d[0] + fmla v21.2d, v5.2d, v13.d[0] + fmla v29.2d, v5.2d, v15.d[0] ldp d10, d11, [pB], #16 - fmla v18.2d, v6.2d, v12.2d[0] - fmla v22.2d, v6.2d, v13.2d[0] + fmla v18.2d, v6.2d, v12.d[0] + fmla v22.2d, v6.2d, v13.d[0] prfm PLDL1KEEP, [pB, #B_PRE_SIZE] - fmla v26.2d, v6.2d, v14.2d[0] - fmla v30.2d, v6.2d, v15.2d[0] + fmla v26.2d, v6.2d, v14.d[0] + fmla v30.2d, v6.2d, v15.d[0] - fmla v19.2d, v7.2d, v12.2d[0] - fmla v23.2d, v7.2d, v13.2d[0] + fmla v19.2d, v7.2d, v12.d[0] + fmla v23.2d, v7.2d, v13.d[0] ldp q2, q3, [pA], #32 - fmla v27.2d, v7.2d, v14.2d[0] - fmla v31.2d, v7.2d, v15.2d[0] + fmla v27.2d, v7.2d, v14.d[0] + fmla v31.2d, v7.2d, v15.d[0] .endm .macro KERNEL8x4_E - fmla v16.2d, v4.2d, v12.2d[0] - fmla v20.2d, v4.2d, v13.2d[0] - fmla v24.2d, v4.2d, v14.2d[0] - fmla v28.2d, v4.2d, v15.2d[0] + fmla v16.2d, v4.2d, v12.d[0] + fmla v20.2d, v4.2d, v13.d[0] + fmla v24.2d, v4.2d, v14.d[0] + fmla v28.2d, v4.2d, v15.d[0] - fmla v17.2d, v5.2d, v12.2d[0] - fmla v25.2d, v5.2d, v14.2d[0] - fmla v21.2d, v5.2d, v13.2d[0] - fmla v29.2d, v5.2d, v15.2d[0] + fmla v17.2d, v5.2d, 
v12.d[0] + fmla v25.2d, v5.2d, v14.d[0] + fmla v21.2d, v5.2d, v13.d[0] + fmla v29.2d, v5.2d, v15.d[0] prfm PLDL1KEEP, [pB, #B_PRE_SIZE] - fmla v18.2d, v6.2d, v12.2d[0] - fmla v22.2d, v6.2d, v13.2d[0] - fmla v26.2d, v6.2d, v14.2d[0] - fmla v30.2d, v6.2d, v15.2d[0] + fmla v18.2d, v6.2d, v12.d[0] + fmla v22.2d, v6.2d, v13.d[0] + fmla v26.2d, v6.2d, v14.d[0] + fmla v30.2d, v6.2d, v15.d[0] - fmla v19.2d, v7.2d, v12.2d[0] - fmla v23.2d, v7.2d, v13.2d[0] - fmla v27.2d, v7.2d, v14.2d[0] - fmla v31.2d, v7.2d, v15.2d[0] + fmla v19.2d, v7.2d, v12.d[0] + fmla v23.2d, v7.2d, v13.d[0] + fmla v27.2d, v7.2d, v14.d[0] + fmla v31.2d, v7.2d, v15.d[0] .endm .macro KERNEL8x4_SUB @@ -293,39 +293,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ldp d8, d9, [pB], #16 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v20.2d, v0.2d, v9.2d[0] + fmla v16.2d, v0.2d, v8.d[0] + fmla v20.2d, v0.2d, v9.d[0] ldp d10, d11, [pB], #16 - fmla v17.2d, v1.2d, v8.2d[0] - fmla v21.2d, v1.2d, v9.2d[0] + fmla v17.2d, v1.2d, v8.d[0] + fmla v21.2d, v1.2d, v9.d[0] ldp q2, q3, [pA], #32 - fmla v24.2d, v0.2d, v10.2d[0] - fmla v28.2d, v0.2d, v11.2d[0] + fmla v24.2d, v0.2d, v10.d[0] + fmla v28.2d, v0.2d, v11.d[0] - fmla v25.2d, v1.2d, v10.2d[0] - fmla v29.2d, v1.2d, v11.2d[0] + fmla v25.2d, v1.2d, v10.d[0] + fmla v29.2d, v1.2d, v11.d[0] prfm PLDL1KEEP, [pA, #A_PRE_SIZE] - fmla v18.2d, v2.2d, v8.2d[0] - fmla v22.2d, v2.2d, v9.2d[0] + fmla v18.2d, v2.2d, v8.d[0] + fmla v22.2d, v2.2d, v9.d[0] prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] - fmla v26.2d, v2.2d, v10.2d[0] - fmla v30.2d, v2.2d, v11.2d[0] + fmla v26.2d, v2.2d, v10.d[0] + fmla v30.2d, v2.2d, v11.d[0] prfm PLDL1KEEP, [pB, #B_PRE_SIZE] - fmla v19.2d, v3.2d, v8.2d[0] - fmla v27.2d, v3.2d, v10.2d[0] + fmla v19.2d, v3.2d, v8.d[0] + fmla v27.2d, v3.2d, v10.d[0] - fmla v31.2d, v3.2d, v11.2d[0] - fmla v23.2d, v3.2d, v9.2d[0] + fmla v31.2d, v3.2d, v11.d[0] + fmla v23.2d, v3.2d, v9.d[0] .endm .macro SAVE8x4 @@ -419,17 +419,17 @@ USE OF THIS SOFTWARE, EVEN IF 
ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d, v1.2d}, [pA] add pA, pA, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v29.2d, v1.2d, v9.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v29.2d, v1.2d, v9.d[1] - fmla v20.2d, v0.2d, v8.2d[1] - fmla v25.2d, v1.2d, v9.2d[0] + fmla v20.2d, v0.2d, v8.d[1] + fmla v25.2d, v1.2d, v9.d[0] - fmla v24.2d, v0.2d, v9.2d[0] - fmla v21.2d, v1.2d, v8.2d[1] + fmla v24.2d, v0.2d, v9.d[0] + fmla v21.2d, v1.2d, v8.d[1] - fmla v28.2d, v0.2d, v9.2d[1] - fmla v17.2d, v1.2d, v8.2d[0] + fmla v28.2d, v0.2d, v9.d[1] + fmla v17.2d, v1.2d, v8.d[0] .endm .macro SAVE4x4 @@ -479,10 +479,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d}, [pA] add pA, pA, #16 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v20.2d, v0.2d, v8.2d[1] - fmla v24.2d, v0.2d, v9.2d[0] - fmla v28.2d, v0.2d, v9.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v20.2d, v0.2d, v8.d[1] + fmla v24.2d, v0.2d, v9.d[0] + fmla v28.2d, v0.2d, v9.d[1] .endm .macro SAVE2x4 @@ -573,15 +573,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v2.2d, v3.2d}, [pA] add pA, pA, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v17.2d, v1.2d, v8.2d[0] - fmla v18.2d, v2.2d, v8.2d[0] - fmla v19.2d, v3.2d, v8.2d[0] + fmla v16.2d, v0.2d, v8.d[0] + fmla v17.2d, v1.2d, v8.d[0] + fmla v18.2d, v2.2d, v8.d[0] + fmla v19.2d, v3.2d, v8.d[0] - fmla v20.2d, v0.2d, v8.2d[1] - fmla v21.2d, v1.2d, v8.2d[1] - fmla v22.2d, v2.2d, v8.2d[1] - fmla v23.2d, v3.2d, v8.2d[1] + fmla v20.2d, v0.2d, v8.d[1] + fmla v21.2d, v1.2d, v8.d[1] + fmla v22.2d, v2.2d, v8.d[1] + fmla v23.2d, v3.2d, v8.d[1] .endm .macro SAVE8x2 @@ -620,10 +620,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld1 {v0.2d, v1.2d}, [pA] add pA, pA, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v17.2d, v1.2d, v8.2d[0] - fmla v20.2d, v0.2d, v8.2d[1] - fmla v21.2d, v1.2d, v8.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v17.2d, v1.2d, v8.d[0] + fmla v20.2d, v0.2d, v8.d[1] + fmla v21.2d, v1.2d, v8.d[1] .endm .macro SAVE4x2 @@ -657,8 +657,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d}, [pA] add pA, pA, #16 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v20.2d, v0.2d, v8.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v20.2d, v0.2d, v8.d[1] .endm .macro SAVE2x2 @@ -689,7 +689,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ldr d0 , [pA] add pA, pA, #8 - fmla v16.2d, v8.2d, v0.2d[0] + fmla v16.2d, v8.2d, v0.d[0] .endm .macro SAVE1x2 @@ -724,10 +724,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v2.2d, v3.2d}, [pA] add pA, pA, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v17.2d, v1.2d, v8.2d[0] - fmla v18.2d, v2.2d, v8.2d[0] - fmla v19.2d, v3.2d, v8.2d[0] + fmla v16.2d, v0.2d, v8.d[0] + fmla v17.2d, v1.2d, v8.d[0] + fmla v18.2d, v2.2d, v8.d[0] + fmla v19.2d, v3.2d, v8.d[0] .endm .macro SAVE8x1 @@ -757,8 +757,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d, v1.2d}, [pA] add pA , pA, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v17.2d, v1.2d, v8.2d[0] + fmla v16.2d, v0.2d, v8.d[0] + fmla v17.2d, v1.2d, v8.d[0] .endm .macro SAVE4x1 @@ -785,7 +785,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d}, [pA] add pA , pA, #16 - fmla v16.2d, v0.2d, v8.2d[0] + fmla v16.2d, v0.2d, v8.d[0] .endm .macro SAVE2x1 diff --git a/kernel/arm64/dtrmm_kernel_4x4.S b/kernel/arm64/dtrmm_kernel_4x4.S index 0d1b12881..34fb8c233 100644 --- a/kernel/arm64/dtrmm_kernel_4x4.S +++ b/kernel/arm64/dtrmm_kernel_4x4.S @@ -147,17 +147,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld1 {v0.2d, v1.2d}, [pA] add pA, pA, #32 - fmul v16.2d, v0.2d, v8.2d[0] - fmul v29.2d, v1.2d, v9.2d[1] + fmul v16.2d, v0.2d, v8.d[0] + fmul v29.2d, v1.2d, v9.d[1] - fmul v20.2d, v0.2d, v8.2d[1] - fmul v25.2d, v1.2d, v9.2d[0] + fmul v20.2d, v0.2d, v8.d[1] + fmul v25.2d, v1.2d, v9.d[0] - fmul v24.2d, v0.2d, v9.2d[0] - fmul v21.2d, v1.2d, v8.2d[1] + fmul v24.2d, v0.2d, v9.d[0] + fmul v21.2d, v1.2d, v8.d[1] - fmul v28.2d, v0.2d, v9.2d[1] - fmul v17.2d, v1.2d, v8.2d[0] + fmul v28.2d, v0.2d, v9.d[1] + fmul v17.2d, v1.2d, v8.d[0] ld1 {v12.2d, v13.2d}, [pB] add pB, pB, #32 @@ -166,61 +166,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL4x4_M1 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v29.2d, v1.2d, v9.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v29.2d, v1.2d, v9.d[1] ld1 {v12.2d, v13.2d}, [pB] // For next round add pB, pB, #32 - fmla v20.2d, v0.2d, v8.2d[1] - fmla v25.2d, v1.2d, v9.2d[0] + fmla v20.2d, v0.2d, v8.d[1] + fmla v25.2d, v1.2d, v9.d[0] ld1 {v4.2d, v5.2d}, [pA] // For next round add pA, pA, #32 - fmla v24.2d, v0.2d, v9.2d[0] - fmla v21.2d, v1.2d, v8.2d[1] + fmla v24.2d, v0.2d, v9.d[0] + fmla v21.2d, v1.2d, v8.d[1] prfm PLDL1KEEP, [pA, #512] - fmla v28.2d, v0.2d, v9.2d[1] - fmla v17.2d, v1.2d, v8.2d[0] + fmla v28.2d, v0.2d, v9.d[1] + fmla v17.2d, v1.2d, v8.d[0] .endm .macro KERNEL4x4_M2 - fmla v16.2d, v4.2d, v12.2d[0] - fmla v29.2d, v5.2d, v13.2d[1] + fmla v16.2d, v4.2d, v12.d[0] + fmla v29.2d, v5.2d, v13.d[1] ld1 {v8.2d, v9.2d}, [pB] // For next round add pB, pB, #32 - fmla v20.2d, v4.2d, v12.2d[1] - fmla v25.2d, v5.2d, v13.2d[0] + fmla v20.2d, v4.2d, v12.d[1] + fmla v25.2d, v5.2d, v13.d[0] ld1 {v0.2d, v1.2d}, [pA] // For next round add pA, pA, #32 - fmla v24.2d, v4.2d, v13.2d[0] - fmla v21.2d, v5.2d, v12.2d[1] + fmla v24.2d, v4.2d, v13.d[0] + fmla v21.2d, v5.2d, v12.d[1] prfm PLDL1KEEP, [pB, #512] - fmla v28.2d, v4.2d, v13.2d[1] - fmla v17.2d, v5.2d, v12.2d[0] + fmla v28.2d, v4.2d, v13.d[1] + fmla v17.2d, v5.2d, 
v12.d[0] .endm .macro KERNEL4x4_E - fmla v16.2d, v4.2d, v12.2d[0] - fmla v29.2d, v5.2d, v13.2d[1] + fmla v16.2d, v4.2d, v12.d[0] + fmla v29.2d, v5.2d, v13.d[1] - fmla v20.2d, v4.2d, v12.2d[1] - fmla v25.2d, v5.2d, v13.2d[0] + fmla v20.2d, v4.2d, v12.d[1] + fmla v25.2d, v5.2d, v13.d[0] - fmla v24.2d, v4.2d, v13.2d[0] - fmla v21.2d, v5.2d, v12.2d[1] + fmla v24.2d, v4.2d, v13.d[0] + fmla v21.2d, v5.2d, v12.d[1] - fmla v28.2d, v4.2d, v13.2d[1] - fmla v17.2d, v5.2d, v12.2d[0] + fmla v28.2d, v4.2d, v13.d[1] + fmla v17.2d, v5.2d, v12.d[0] .endm .macro KERNEL4x4_SUB @@ -229,17 +229,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d, v1.2d}, [pA] add pA, pA, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v29.2d, v1.2d, v9.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v29.2d, v1.2d, v9.d[1] - fmla v20.2d, v0.2d, v8.2d[1] - fmla v25.2d, v1.2d, v9.2d[0] + fmla v20.2d, v0.2d, v8.d[1] + fmla v25.2d, v1.2d, v9.d[0] - fmla v24.2d, v0.2d, v9.2d[0] - fmla v21.2d, v1.2d, v8.2d[1] + fmla v24.2d, v0.2d, v9.d[0] + fmla v21.2d, v1.2d, v8.d[1] - fmla v28.2d, v0.2d, v9.2d[1] - fmla v17.2d, v1.2d, v8.2d[0] + fmla v28.2d, v0.2d, v9.d[1] + fmla v17.2d, v1.2d, v8.d[0] .endm .macro SAVE4x4 @@ -283,10 +283,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d}, [pA] add pA, pA, #16 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v20.2d, v0.2d, v8.2d[1] - fmla v24.2d, v0.2d, v9.2d[0] - fmla v28.2d, v0.2d, v9.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v20.2d, v0.2d, v8.d[1] + fmla v24.2d, v0.2d, v9.d[0] + fmla v28.2d, v0.2d, v9.d[1] .endm .macro SAVE2x4 @@ -361,10 +361,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld1 {v0.2d, v1.2d}, [pA] add pA, pA, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v17.2d, v1.2d, v8.2d[0] - fmla v20.2d, v0.2d, v8.2d[1] - fmla v21.2d, v1.2d, v8.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v17.2d, v1.2d, v8.d[0] + fmla v20.2d, v0.2d, v8.d[1] + fmla v21.2d, v1.2d, v8.d[1] .endm .macro SAVE4x2 @@ -395,8 +395,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d}, [pA] add pA, pA, #16 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v20.2d, v0.2d, v8.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v20.2d, v0.2d, v8.d[1] .endm .macro SAVE2x2 @@ -424,7 +424,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ldr d0 , [pA] add pA, pA, #8 - fmla v16.2d, v8.2d, v0.2d[0] + fmla v16.2d, v8.2d, v0.d[0] .endm .macro SAVE1x2 @@ -451,8 +451,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d, v1.2d}, [pA] add pA , pA, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v17.2d, v1.2d, v8.2d[0] + fmla v16.2d, v0.2d, v8.d[0] + fmla v17.2d, v1.2d, v8.d[0] .endm .macro SAVE4x1 @@ -479,7 +479,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d}, [pA] add pA , pA, #16 - fmla v16.2d, v0.2d, v8.2d[0] + fmla v16.2d, v0.2d, v8.d[0] .endm .macro SAVE2x1 diff --git a/kernel/arm64/dtrmm_kernel_4x8.S b/kernel/arm64/dtrmm_kernel_4x8.S old mode 100755 new mode 100644 index eb7397faa..4aecf28eb --- a/kernel/arm64/dtrmm_kernel_4x8.S +++ b/kernel/arm64/dtrmm_kernel_4x8.S @@ -157,25 +157,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld1 {v10.2d, v11.2d}, [pB] add pB, pB, #32 - fmul v16.2d, v0.2d, v8.2d[0] - fmul v17.2d, v1.2d, v8.2d[0] - fmul v18.2d, v0.2d, v8.2d[1] - fmul v19.2d, v1.2d, v8.2d[1] - - fmul v20.2d, v0.2d, v9.2d[0] - fmul v21.2d, v1.2d, v9.2d[0] - fmul v22.2d, v0.2d, v9.2d[1] - fmul v23.2d, v1.2d, v9.2d[1] - - fmul v24.2d, v0.2d, v10.2d[0] - fmul v25.2d, v1.2d, v10.2d[0] - fmul v26.2d, v0.2d, v10.2d[1] - fmul v27.2d, v1.2d, v10.2d[1] - - fmul v28.2d, v0.2d, v11.2d[0] - fmul v29.2d, v1.2d, v11.2d[0] - fmul v30.2d, v0.2d, v11.2d[1] - fmul v31.2d, v1.2d, v11.2d[1] + fmul v16.2d, v0.2d, v8.d[0] + fmul v17.2d, v1.2d, v8.d[0] + fmul v18.2d, v0.2d, v8.d[1] + fmul v19.2d, v1.2d, v8.d[1] + + fmul v20.2d, v0.2d, v9.d[0] + fmul v21.2d, v1.2d, v9.d[0] + fmul v22.2d, v0.2d, v9.d[1] + fmul v23.2d, v1.2d, v9.d[1] + + fmul v24.2d, v0.2d, v10.d[0] + fmul v25.2d, v1.2d, v10.d[0] + fmul v26.2d, v0.2d, v10.d[1] + fmul v27.2d, v1.2d, v10.d[1] + + fmul v28.2d, v0.2d, v11.d[0] + fmul v29.2d, v1.2d, v11.d[0] + fmul v30.2d, v0.2d, v11.d[1] + fmul v31.2d, v1.2d, v11.d[1] ld1 {v12.2d, v13.2d}, [pB] add pB, pB, #32 @@ -186,25 +186,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL4x8_M1 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v17.2d, v1.2d, v8.2d[0] - fmla v18.2d, v0.2d, v8.2d[1] - fmla v19.2d, v1.2d, v8.2d[1] - - fmla v20.2d, v0.2d, v9.2d[0] - fmla v21.2d, v1.2d, v9.2d[0] - fmla v22.2d, v0.2d, v9.2d[1] - fmla v23.2d, v1.2d, v9.2d[1] - - fmla v24.2d, v0.2d, v10.2d[0] - fmla v25.2d, v1.2d, v10.2d[0] - fmla v26.2d, v0.2d, v10.2d[1] - fmla v27.2d, v1.2d, v10.2d[1] - - fmla v28.2d, v0.2d, v11.2d[0] - fmla v29.2d, v1.2d, v11.2d[0] - fmla v30.2d, v0.2d, v11.2d[1] - fmla v31.2d, v1.2d, v11.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v17.2d, v1.2d, v8.d[0] + fmla v18.2d, v0.2d, v8.d[1] + fmla v19.2d, v1.2d, v8.d[1] + + fmla v20.2d, v0.2d, v9.d[0] + fmla v21.2d, v1.2d, v9.d[0] + fmla v22.2d, v0.2d, v9.d[1] + fmla v23.2d, v1.2d, v9.d[1] + + fmla v24.2d, v0.2d, v10.d[0] + fmla v25.2d, v1.2d, v10.d[0] + fmla v26.2d, v0.2d, v10.d[1] + fmla v27.2d, v1.2d, v10.d[1] + + fmla v28.2d, v0.2d, v11.d[0] + fmla v29.2d, v1.2d, v11.d[0] + fmla v30.2d, v0.2d, v11.d[1] + fmla v31.2d, v1.2d, v11.d[1] ld1 {v12.2d, v13.2d}, [pB] // For next round add pB, pB, #32 @@ -217,25 +217,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL4x8_M2 - fmla v16.2d, v4.2d, v12.2d[0] - fmla v17.2d, v5.2d, v12.2d[0] - fmla v18.2d, v4.2d, v12.2d[1] - fmla v19.2d, v5.2d, v12.2d[1] - - fmla v20.2d, v4.2d, v13.2d[0] - fmla v21.2d, v5.2d, v13.2d[0] - fmla v22.2d, v4.2d, v13.2d[1] - fmla v23.2d, v5.2d, v13.2d[1] - - fmla v24.2d, v4.2d, v14.2d[0] - fmla v25.2d, v5.2d, v14.2d[0] - fmla v26.2d, v4.2d, v14.2d[1] - fmla v27.2d, v5.2d, v14.2d[1] - - fmla v28.2d, v4.2d, v15.2d[0] - fmla v29.2d, v5.2d, v15.2d[0] - fmla v30.2d, v4.2d, v15.2d[1] - fmla v31.2d, v5.2d, v15.2d[1] + fmla v16.2d, v4.2d, v12.d[0] + fmla v17.2d, v5.2d, v12.d[0] + fmla v18.2d, v4.2d, v12.d[1] + fmla v19.2d, v5.2d, v12.d[1] + + fmla v20.2d, v4.2d, v13.d[0] + fmla v21.2d, v5.2d, v13.d[0] + fmla v22.2d, v4.2d, v13.d[1] + fmla v23.2d, v5.2d, v13.d[1] + + fmla v24.2d, v4.2d, v14.d[0] + fmla v25.2d, v5.2d, v14.d[0] + fmla v26.2d, v4.2d, v14.d[1] + fmla v27.2d, v5.2d, v14.d[1] + + fmla v28.2d, v4.2d, v15.d[0] + fmla v29.2d, v5.2d, v15.d[0] + fmla v30.2d, v4.2d, v15.d[1] + fmla v31.2d, v5.2d, v15.d[1] ld1 {v8.2d, v9.2d}, [pB] // For next round add pB, pB, #32 @@ -248,25 +248,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL4x8_E - fmla v16.2d, v4.2d, v12.2d[0] - fmla v17.2d, v5.2d, v12.2d[0] - fmla v18.2d, v4.2d, v12.2d[1] - fmla v19.2d, v5.2d, v12.2d[1] - - fmla v20.2d, v4.2d, v13.2d[0] - fmla v21.2d, v5.2d, v13.2d[0] - fmla v22.2d, v4.2d, v13.2d[1] - fmla v23.2d, v5.2d, v13.2d[1] - - fmla v24.2d, v4.2d, v14.2d[0] - fmla v25.2d, v5.2d, v14.2d[0] - fmla v26.2d, v4.2d, v14.2d[1] - fmla v27.2d, v5.2d, v14.2d[1] - - fmla v28.2d, v4.2d, v15.2d[0] - fmla v29.2d, v5.2d, v15.2d[0] - fmla v30.2d, v4.2d, v15.2d[1] - fmla v31.2d, v5.2d, v15.2d[1] + fmla v16.2d, v4.2d, v12.d[0] + fmla v17.2d, v5.2d, v12.d[0] + fmla v18.2d, v4.2d, v12.d[1] + fmla v19.2d, v5.2d, v12.d[1] + + fmla v20.2d, v4.2d, v13.d[0] + fmla v21.2d, v5.2d, v13.d[0] + fmla v22.2d, v4.2d, v13.d[1] + fmla v23.2d, v5.2d, v13.d[1] + + fmla v24.2d, v4.2d, v14.d[0] + fmla v25.2d, v5.2d, v14.d[0] + fmla v26.2d, v4.2d, v14.d[1] + fmla v27.2d, v5.2d, v14.d[1] + + fmla v28.2d, v4.2d, v15.d[0] + fmla v29.2d, v5.2d, v15.d[0] + fmla v30.2d, v4.2d, v15.d[1] + fmla v31.2d, v5.2d, v15.d[1] .endm .macro KERNEL4x8_SUB @@ -277,25 +277,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld1 {v10.2d, v11.2d}, [pB] add pB, pB, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v17.2d, v1.2d, v8.2d[0] - fmla v18.2d, v0.2d, v8.2d[1] - fmla v19.2d, v1.2d, v8.2d[1] - - fmla v20.2d, v0.2d, v9.2d[0] - fmla v21.2d, v1.2d, v9.2d[0] - fmla v22.2d, v0.2d, v9.2d[1] - fmla v23.2d, v1.2d, v9.2d[1] - - fmla v24.2d, v0.2d, v10.2d[0] - fmla v25.2d, v1.2d, v10.2d[0] - fmla v26.2d, v0.2d, v10.2d[1] - fmla v27.2d, v1.2d, v10.2d[1] - - fmla v28.2d, v0.2d, v11.2d[0] - fmla v29.2d, v1.2d, v11.2d[0] - fmla v30.2d, v0.2d, v11.2d[1] - fmla v31.2d, v1.2d, v11.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v17.2d, v1.2d, v8.d[0] + fmla v18.2d, v0.2d, v8.d[1] + fmla v19.2d, v1.2d, v8.d[1] + + fmla v20.2d, v0.2d, v9.d[0] + fmla v21.2d, v1.2d, v9.d[0] + fmla v22.2d, v0.2d, v9.d[1] + fmla v23.2d, v1.2d, v9.d[1] + + fmla v24.2d, v0.2d, v10.d[0] + fmla v25.2d, v1.2d, v10.d[0] + fmla v26.2d, v0.2d, v10.d[1] + fmla v27.2d, v1.2d, v10.d[1] + + fmla v28.2d, v0.2d, v11.d[0] + fmla v29.2d, v1.2d, v11.d[0] + fmla v30.2d, v0.2d, v11.d[1] + fmla v31.2d, v1.2d, v11.d[1] .endm .macro SAVE4x8 @@ -369,17 +369,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v10.2d, v11.2d}, [pB] add pB, pB, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v18.2d, v0.2d, v8.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v18.2d, v0.2d, v8.d[1] - fmla v20.2d, v0.2d, v9.2d[0] - fmla v22.2d, v0.2d, v9.2d[1] + fmla v20.2d, v0.2d, v9.d[0] + fmla v22.2d, v0.2d, v9.d[1] - fmla v24.2d, v0.2d, v10.2d[0] - fmla v26.2d, v0.2d, v10.2d[1] + fmla v24.2d, v0.2d, v10.d[0] + fmla v26.2d, v0.2d, v10.d[1] - fmla v28.2d, v0.2d, v11.2d[0] - fmla v30.2d, v0.2d, v11.2d[1] + fmla v28.2d, v0.2d, v11.d[0] + fmla v30.2d, v0.2d, v11.d[1] .endm .macro SAVE2x8 @@ -499,17 +499,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld1 {v0.2d, v1.2d}, [pA] add pA, pA, #32 - fmul v16.2d, v0.2d, v8.2d[0] - fmul v29.2d, v1.2d, v9.2d[1] + fmul v16.2d, v0.2d, v8.d[0] + fmul v29.2d, v1.2d, v9.d[1] - fmul v20.2d, v0.2d, v8.2d[1] - fmul v25.2d, v1.2d, v9.2d[0] + fmul v20.2d, v0.2d, v8.d[1] + fmul v25.2d, v1.2d, v9.d[0] - fmul v24.2d, v0.2d, v9.2d[0] - fmul v21.2d, v1.2d, v8.2d[1] + fmul v24.2d, v0.2d, v9.d[0] + fmul v21.2d, v1.2d, v8.d[1] - fmul v28.2d, v0.2d, v9.2d[1] - fmul v17.2d, v1.2d, v8.2d[0] + fmul v28.2d, v0.2d, v9.d[1] + fmul v17.2d, v1.2d, v8.d[0] ld1 {v12.2d, v13.2d}, [pB] add pB, pB, #32 @@ -518,61 +518,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL4x4_M1 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v29.2d, v1.2d, v9.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v29.2d, v1.2d, v9.d[1] ld1 {v12.2d, v13.2d}, [pB] // For next round add pB, pB, #32 - fmla v20.2d, v0.2d, v8.2d[1] - fmla v25.2d, v1.2d, v9.2d[0] + fmla v20.2d, v0.2d, v8.d[1] + fmla v25.2d, v1.2d, v9.d[0] ld1 {v4.2d, v5.2d}, [pA] // For next round add pA, pA, #32 - fmla v24.2d, v0.2d, v9.2d[0] - fmla v21.2d, v1.2d, v8.2d[1] + fmla v24.2d, v0.2d, v9.d[0] + fmla v21.2d, v1.2d, v8.d[1] prfm PLDL1KEEP, [pA, #512] - fmla v28.2d, v0.2d, v9.2d[1] - fmla v17.2d, v1.2d, v8.2d[0] + fmla v28.2d, v0.2d, v9.d[1] + fmla v17.2d, v1.2d, v8.d[0] .endm .macro KERNEL4x4_M2 - fmla v16.2d, v4.2d, v12.2d[0] - fmla v29.2d, v5.2d, v13.2d[1] + fmla v16.2d, v4.2d, v12.d[0] + fmla v29.2d, v5.2d, v13.d[1] ld1 {v8.2d, v9.2d}, [pB] // For next round add pB, pB, #32 - fmla v20.2d, v4.2d, v12.2d[1] - fmla v25.2d, v5.2d, v13.2d[0] + fmla v20.2d, v4.2d, v12.d[1] + fmla v25.2d, v5.2d, v13.d[0] ld1 {v0.2d, v1.2d}, [pA] // For next round add pA, pA, #32 - fmla v24.2d, v4.2d, v13.2d[0] - fmla v21.2d, v5.2d, v12.2d[1] + fmla v24.2d, v4.2d, v13.d[0] + fmla v21.2d, v5.2d, v12.d[1] prfm PLDL1KEEP, [pB, #512] - fmla v28.2d, v4.2d, v13.2d[1] - fmla v17.2d, v5.2d, v12.2d[0] + fmla v28.2d, v4.2d, v13.d[1] + fmla v17.2d, v5.2d, 
v12.d[0] .endm .macro KERNEL4x4_E - fmla v16.2d, v4.2d, v12.2d[0] - fmla v29.2d, v5.2d, v13.2d[1] + fmla v16.2d, v4.2d, v12.d[0] + fmla v29.2d, v5.2d, v13.d[1] - fmla v20.2d, v4.2d, v12.2d[1] - fmla v25.2d, v5.2d, v13.2d[0] + fmla v20.2d, v4.2d, v12.d[1] + fmla v25.2d, v5.2d, v13.d[0] - fmla v24.2d, v4.2d, v13.2d[0] - fmla v21.2d, v5.2d, v12.2d[1] + fmla v24.2d, v4.2d, v13.d[0] + fmla v21.2d, v5.2d, v12.d[1] - fmla v28.2d, v4.2d, v13.2d[1] - fmla v17.2d, v5.2d, v12.2d[0] + fmla v28.2d, v4.2d, v13.d[1] + fmla v17.2d, v5.2d, v12.d[0] .endm .macro KERNEL4x4_SUB @@ -581,17 +581,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d, v1.2d}, [pA] add pA, pA, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v29.2d, v1.2d, v9.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v29.2d, v1.2d, v9.d[1] - fmla v20.2d, v0.2d, v8.2d[1] - fmla v25.2d, v1.2d, v9.2d[0] + fmla v20.2d, v0.2d, v8.d[1] + fmla v25.2d, v1.2d, v9.d[0] - fmla v24.2d, v0.2d, v9.2d[0] - fmla v21.2d, v1.2d, v8.2d[1] + fmla v24.2d, v0.2d, v9.d[0] + fmla v21.2d, v1.2d, v8.d[1] - fmla v28.2d, v0.2d, v9.2d[1] - fmla v17.2d, v1.2d, v8.2d[0] + fmla v28.2d, v0.2d, v9.d[1] + fmla v17.2d, v1.2d, v8.d[0] .endm .macro SAVE4x4 @@ -635,10 +635,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d}, [pA] add pA, pA, #16 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v20.2d, v0.2d, v8.2d[1] - fmla v24.2d, v0.2d, v9.2d[0] - fmla v28.2d, v0.2d, v9.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v20.2d, v0.2d, v8.d[1] + fmla v24.2d, v0.2d, v9.d[0] + fmla v28.2d, v0.2d, v9.d[1] .endm .macro SAVE2x4 @@ -713,10 +713,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld1 {v0.2d, v1.2d}, [pA] add pA, pA, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v17.2d, v1.2d, v8.2d[0] - fmla v20.2d, v0.2d, v8.2d[1] - fmla v21.2d, v1.2d, v8.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v17.2d, v1.2d, v8.d[0] + fmla v20.2d, v0.2d, v8.d[1] + fmla v21.2d, v1.2d, v8.d[1] .endm .macro SAVE4x2 @@ -747,8 +747,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d}, [pA] add pA, pA, #16 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v20.2d, v0.2d, v8.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v20.2d, v0.2d, v8.d[1] .endm .macro SAVE2x2 @@ -776,7 +776,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ldr d0 , [pA] add pA, pA, #8 - fmla v16.2d, v8.2d, v0.2d[0] + fmla v16.2d, v8.2d, v0.d[0] .endm .macro SAVE1x2 @@ -803,8 +803,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d, v1.2d}, [pA] add pA , pA, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v17.2d, v1.2d, v8.2d[0] + fmla v16.2d, v0.2d, v8.d[0] + fmla v17.2d, v1.2d, v8.d[0] .endm .macro SAVE4x1 @@ -831,7 +831,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d}, [pA] add pA , pA, #16 - fmla v16.2d, v0.2d, v8.2d[0] + fmla v16.2d, v0.2d, v8.d[0] .endm .macro SAVE2x1 diff --git a/kernel/arm64/dtrmm_kernel_8x4.S b/kernel/arm64/dtrmm_kernel_8x4.S old mode 100755 new mode 100644 index 6890505bd..b06c7560d --- a/kernel/arm64/dtrmm_kernel_8x4.S +++ b/kernel/arm64/dtrmm_kernel_8x4.S @@ -157,25 +157,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld1 {v2.2d, v3.2d}, [pA] add pA, pA, #32 - fmul v16.2d, v0.2d, v8.2d[0] - fmul v17.2d, v1.2d, v8.2d[0] - fmul v18.2d, v2.2d, v8.2d[0] - fmul v19.2d, v3.2d, v8.2d[0] - - fmul v20.2d, v0.2d, v8.2d[1] - fmul v21.2d, v1.2d, v8.2d[1] - fmul v22.2d, v2.2d, v8.2d[1] - fmul v23.2d, v3.2d, v8.2d[1] - - fmul v24.2d, v0.2d, v9.2d[0] - fmul v25.2d, v1.2d, v9.2d[0] - fmul v26.2d, v2.2d, v9.2d[0] - fmul v27.2d, v3.2d, v9.2d[0] - - fmul v28.2d, v0.2d, v9.2d[1] - fmul v29.2d, v1.2d, v9.2d[1] - fmul v30.2d, v2.2d, v9.2d[1] - fmul v31.2d, v3.2d, v9.2d[1] + fmul v16.2d, v0.2d, v8.d[0] + fmul v17.2d, v1.2d, v8.d[0] + fmul v18.2d, v2.2d, v8.d[0] + fmul v19.2d, v3.2d, v8.d[0] + + fmul v20.2d, v0.2d, v8.d[1] + fmul v21.2d, v1.2d, v8.d[1] + fmul v22.2d, v2.2d, v8.d[1] + fmul v23.2d, v3.2d, v8.d[1] + + fmul v24.2d, v0.2d, v9.d[0] + fmul v25.2d, v1.2d, v9.d[0] + fmul v26.2d, v2.2d, v9.d[0] + fmul v27.2d, v3.2d, v9.d[0] + + fmul v28.2d, v0.2d, v9.d[1] + fmul v29.2d, v1.2d, v9.d[1] + fmul v30.2d, v2.2d, v9.d[1] + fmul v31.2d, v3.2d, v9.d[1] ld1 {v4.2d, v5.2d}, [pA] add pA, pA, #32 @@ -186,25 +186,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL8x4_M1 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v17.2d, v1.2d, v8.2d[0] - fmla v18.2d, v2.2d, v8.2d[0] - fmla v19.2d, v3.2d, v8.2d[0] - - fmla v20.2d, v0.2d, v8.2d[1] - fmla v21.2d, v1.2d, v8.2d[1] - fmla v22.2d, v2.2d, v8.2d[1] - fmla v23.2d, v3.2d, v8.2d[1] - - fmla v24.2d, v0.2d, v9.2d[0] - fmla v25.2d, v1.2d, v9.2d[0] - fmla v26.2d, v2.2d, v9.2d[0] - fmla v27.2d, v3.2d, v9.2d[0] - - fmla v28.2d, v0.2d, v9.2d[1] - fmla v29.2d, v1.2d, v9.2d[1] - fmla v30.2d, v2.2d, v9.2d[1] - fmla v31.2d, v3.2d, v9.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v17.2d, v1.2d, v8.d[0] + fmla v18.2d, v2.2d, v8.d[0] + fmla v19.2d, v3.2d, v8.d[0] + + fmla v20.2d, v0.2d, v8.d[1] + fmla v21.2d, v1.2d, v8.d[1] + fmla v22.2d, v2.2d, v8.d[1] + fmla v23.2d, v3.2d, v8.d[1] + + fmla v24.2d, v0.2d, v9.d[0] + fmla v25.2d, v1.2d, v9.d[0] + fmla v26.2d, v2.2d, v9.d[0] + fmla v27.2d, v3.2d, v9.d[0] + + fmla v28.2d, v0.2d, v9.d[1] + fmla v29.2d, v1.2d, v9.d[1] + fmla v30.2d, v2.2d, v9.d[1] + fmla v31.2d, v3.2d, v9.d[1] ld1 {v4.2d, v5.2d}, [pA] add pA, pA, #32 @@ -217,25 +217,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL8x4_M2 - fmla v16.2d, v4.2d, v12.2d[0] - fmla v17.2d, v5.2d, v12.2d[0] - fmla v18.2d, v6.2d, v12.2d[0] - fmla v19.2d, v7.2d, v12.2d[0] - - fmla v20.2d, v4.2d, v12.2d[1] - fmla v21.2d, v5.2d, v12.2d[1] - fmla v22.2d, v6.2d, v12.2d[1] - fmla v23.2d, v7.2d, v12.2d[1] - - fmla v24.2d, v4.2d, v13.2d[0] - fmla v25.2d, v5.2d, v13.2d[0] - fmla v26.2d, v6.2d, v13.2d[0] - fmla v27.2d, v7.2d, v13.2d[0] - - fmla v28.2d, v4.2d, v13.2d[1] - fmla v29.2d, v5.2d, v13.2d[1] - fmla v30.2d, v6.2d, v13.2d[1] - fmla v31.2d, v7.2d, v13.2d[1] + fmla v16.2d, v4.2d, v12.d[0] + fmla v17.2d, v5.2d, v12.d[0] + fmla v18.2d, v6.2d, v12.d[0] + fmla v19.2d, v7.2d, v12.d[0] + + fmla v20.2d, v4.2d, v12.d[1] + fmla v21.2d, v5.2d, v12.d[1] + fmla v22.2d, v6.2d, v12.d[1] + fmla v23.2d, v7.2d, v12.d[1] + + fmla v24.2d, v4.2d, v13.d[0] + fmla v25.2d, v5.2d, v13.d[0] + fmla v26.2d, v6.2d, v13.d[0] + fmla v27.2d, v7.2d, v13.d[0] + + fmla v28.2d, v4.2d, v13.d[1] + fmla v29.2d, v5.2d, v13.d[1] + fmla v30.2d, v6.2d, v13.d[1] + fmla v31.2d, v7.2d, v13.d[1] ld1 {v0.2d, v1.2d}, [pA] add pA, pA, #32 @@ -248,25 +248,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL8x4_E - fmla v16.2d, v4.2d, v12.2d[0] - fmla v17.2d, v5.2d, v12.2d[0] - fmla v18.2d, v6.2d, v12.2d[0] - fmla v19.2d, v7.2d, v12.2d[0] - - fmla v20.2d, v4.2d, v12.2d[1] - fmla v21.2d, v5.2d, v12.2d[1] - fmla v22.2d, v6.2d, v12.2d[1] - fmla v23.2d, v7.2d, v12.2d[1] - - fmla v24.2d, v4.2d, v13.2d[0] - fmla v25.2d, v5.2d, v13.2d[0] - fmla v26.2d, v6.2d, v13.2d[0] - fmla v27.2d, v7.2d, v13.2d[0] - - fmla v28.2d, v4.2d, v13.2d[1] - fmla v29.2d, v5.2d, v13.2d[1] - fmla v30.2d, v6.2d, v13.2d[1] - fmla v31.2d, v7.2d, v13.2d[1] + fmla v16.2d, v4.2d, v12.d[0] + fmla v17.2d, v5.2d, v12.d[0] + fmla v18.2d, v6.2d, v12.d[0] + fmla v19.2d, v7.2d, v12.d[0] + + fmla v20.2d, v4.2d, v12.d[1] + fmla v21.2d, v5.2d, v12.d[1] + fmla v22.2d, v6.2d, v12.d[1] + fmla v23.2d, v7.2d, v12.d[1] + + fmla v24.2d, v4.2d, v13.d[0] + fmla v25.2d, v5.2d, v13.d[0] + fmla v26.2d, v6.2d, v13.d[0] + fmla v27.2d, v7.2d, v13.d[0] + + fmla v28.2d, v4.2d, v13.d[1] + fmla v29.2d, v5.2d, v13.d[1] + fmla v30.2d, v6.2d, v13.d[1] + fmla v31.2d, v7.2d, v13.d[1] .endm .macro KERNEL8x4_SUB @@ -277,25 +277,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld1 {v2.2d, v3.2d}, [pA] add pA, pA, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v17.2d, v1.2d, v8.2d[0] - fmla v18.2d, v2.2d, v8.2d[0] - fmla v19.2d, v3.2d, v8.2d[0] - - fmla v20.2d, v0.2d, v8.2d[1] - fmla v21.2d, v1.2d, v8.2d[1] - fmla v22.2d, v2.2d, v8.2d[1] - fmla v23.2d, v3.2d, v8.2d[1] - - fmla v24.2d, v0.2d, v9.2d[0] - fmla v25.2d, v1.2d, v9.2d[0] - fmla v26.2d, v2.2d, v9.2d[0] - fmla v27.2d, v3.2d, v9.2d[0] - - fmla v28.2d, v0.2d, v9.2d[1] - fmla v29.2d, v1.2d, v9.2d[1] - fmla v30.2d, v2.2d, v9.2d[1] - fmla v31.2d, v3.2d, v9.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v17.2d, v1.2d, v8.d[0] + fmla v18.2d, v2.2d, v8.d[0] + fmla v19.2d, v3.2d, v8.d[0] + + fmla v20.2d, v0.2d, v8.d[1] + fmla v21.2d, v1.2d, v8.d[1] + fmla v22.2d, v2.2d, v8.d[1] + fmla v23.2d, v3.2d, v8.d[1] + + fmla v24.2d, v0.2d, v9.d[0] + fmla v25.2d, v1.2d, v9.d[0] + fmla v26.2d, v2.2d, v9.d[0] + fmla v27.2d, v3.2d, v9.d[0] + + fmla v28.2d, v0.2d, v9.d[1] + fmla v29.2d, v1.2d, v9.d[1] + fmla v30.2d, v2.2d, v9.d[1] + fmla v31.2d, v3.2d, v9.d[1] .endm .macro SAVE8x4 @@ -351,17 +351,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d, v1.2d}, [pA] add pA, pA, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v29.2d, v1.2d, v9.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v29.2d, v1.2d, v9.d[1] - fmla v20.2d, v0.2d, v8.2d[1] - fmla v25.2d, v1.2d, v9.2d[0] + fmla v20.2d, v0.2d, v8.d[1] + fmla v25.2d, v1.2d, v9.d[0] - fmla v24.2d, v0.2d, v9.2d[0] - fmla v21.2d, v1.2d, v8.2d[1] + fmla v24.2d, v0.2d, v9.d[0] + fmla v21.2d, v1.2d, v8.d[1] - fmla v28.2d, v0.2d, v9.2d[1] - fmla v17.2d, v1.2d, v8.2d[0] + fmla v28.2d, v0.2d, v9.d[1] + fmla v17.2d, v1.2d, v8.d[0] .endm .macro SAVE4x4 @@ -406,10 +406,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld1 {v0.2d}, [pA] add pA, pA, #16 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v20.2d, v0.2d, v8.2d[1] - fmla v24.2d, v0.2d, v9.2d[0] - fmla v28.2d, v0.2d, v9.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v20.2d, v0.2d, v8.d[1] + fmla v24.2d, v0.2d, v9.d[0] + fmla v28.2d, v0.2d, v9.d[1] .endm .macro SAVE2x4 @@ -490,15 +490,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v2.2d, v3.2d}, [pA] add pA, pA, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v17.2d, v1.2d, v8.2d[0] - fmla v18.2d, v2.2d, v8.2d[0] - fmla v19.2d, v3.2d, v8.2d[0] + fmla v16.2d, v0.2d, v8.d[0] + fmla v17.2d, v1.2d, v8.d[0] + fmla v18.2d, v2.2d, v8.d[0] + fmla v19.2d, v3.2d, v8.d[0] - fmla v20.2d, v0.2d, v8.2d[1] - fmla v21.2d, v1.2d, v8.2d[1] - fmla v22.2d, v2.2d, v8.2d[1] - fmla v23.2d, v3.2d, v8.2d[1] + fmla v20.2d, v0.2d, v8.d[1] + fmla v21.2d, v1.2d, v8.d[1] + fmla v22.2d, v2.2d, v8.d[1] + fmla v23.2d, v3.2d, v8.d[1] .endm .macro SAVE8x2 @@ -534,10 +534,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d, v1.2d}, [pA] add pA, pA, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v17.2d, v1.2d, v8.2d[0] - fmla v20.2d, v0.2d, v8.2d[1] - fmla v21.2d, v1.2d, v8.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v17.2d, v1.2d, v8.d[0] + fmla v20.2d, v0.2d, v8.d[1] + fmla v21.2d, v1.2d, v8.d[1] .endm .macro SAVE4x2 @@ -568,8 +568,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d}, [pA] add pA, pA, #16 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v20.2d, v0.2d, v8.2d[1] + fmla v16.2d, v0.2d, v8.d[0] + fmla v20.2d, v0.2d, v8.d[1] .endm .macro SAVE2x2 @@ -597,7 +597,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ldr d0 , [pA] add pA, pA, #8 - fmla v16.2d, v8.2d, v0.2d[0] + fmla v16.2d, v8.2d, v0.d[0] .endm .macro SAVE1x2 @@ -629,10 +629,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld1 {v2.2d, v3.2d}, [pA] add pA, pA, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v17.2d, v1.2d, v8.2d[0] - fmla v18.2d, v2.2d, v8.2d[0] - fmla v19.2d, v3.2d, v8.2d[0] + fmla v16.2d, v0.2d, v8.d[0] + fmla v17.2d, v1.2d, v8.d[0] + fmla v18.2d, v2.2d, v8.d[0] + fmla v19.2d, v3.2d, v8.d[0] .endm .macro SAVE8x1 @@ -660,8 +660,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d, v1.2d}, [pA] add pA , pA, #32 - fmla v16.2d, v0.2d, v8.2d[0] - fmla v17.2d, v1.2d, v8.2d[0] + fmla v16.2d, v0.2d, v8.d[0] + fmla v17.2d, v1.2d, v8.d[0] .endm .macro SAVE4x1 @@ -686,7 +686,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2d}, [pA] add pA , pA, #16 - fmla v16.2d, v0.2d, v8.2d[0] + fmla v16.2d, v0.2d, v8.d[0] .endm .macro SAVE2x1 diff --git a/kernel/arm64/sgemm_kernel_16x4.S b/kernel/arm64/sgemm_kernel_16x4.S index 22b55b01c..68366d9f2 100644 --- a/kernel/arm64/sgemm_kernel_16x4.S +++ b/kernel/arm64/sgemm_kernel_16x4.S @@ -158,25 +158,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld1 {v3.4s}, [pA] add pA, pA, #16 - fmul v16.4s, v0.4s, v8.2s[0] - fmul v17.4s, v1.4s, v8.2s[0] - fmul v18.4s, v2.4s, v8.2s[0] - fmul v19.4s, v3.4s, v8.2s[0] - - fmul v20.4s, v0.4s, v8.2s[1] - fmul v21.4s, v1.4s, v8.2s[1] - fmul v22.4s, v2.4s, v8.2s[1] - fmul v23.4s, v3.4s, v8.2s[1] - - fmul v24.4s, v0.4s, v9.2s[0] - fmul v25.4s, v1.4s, v9.2s[0] - fmul v26.4s, v2.4s, v9.2s[0] - fmul v27.4s, v3.4s, v9.2s[0] - - fmul v28.4s, v0.4s, v9.2s[1] - fmul v29.4s, v1.4s, v9.2s[1] - fmul v30.4s, v2.4s, v9.2s[1] - fmul v31.4s, v3.4s, v9.2s[1] + fmul v16.4s, v0.4s, v8.s[0] + fmul v17.4s, v1.4s, v8.s[0] + fmul v18.4s, v2.4s, v8.s[0] + fmul v19.4s, v3.4s, v8.s[0] + + fmul v20.4s, v0.4s, v8.s[1] + fmul v21.4s, v1.4s, v8.s[1] + fmul v22.4s, v2.4s, v8.s[1] + fmul v23.4s, v3.4s, v8.s[1] + + fmul v24.4s, v0.4s, v9.s[0] + fmul v25.4s, v1.4s, v9.s[0] + fmul v26.4s, v2.4s, v9.s[0] + fmul v27.4s, v3.4s, v9.s[0] + + fmul v28.4s, v0.4s, v9.s[1] + fmul v29.4s, v1.4s, v9.s[1] + fmul v30.4s, v2.4s, v9.s[1] + fmul v31.4s, v3.4s, v9.s[1] ld1 {v12.2s, v13.2s}, [pB] add pB, pB, #16 @@ -191,25 +191,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL16x4_M1 - fmla v16.4s, v0.4s, v8.2s[0] - fmla v17.4s, v1.4s, v8.2s[0] - fmla v18.4s, v2.4s, v8.2s[0] - fmla v19.4s, v3.4s, v8.2s[0] - - fmla v20.4s, v0.4s, v8.2s[1] - fmla v21.4s, v1.4s, v8.2s[1] - fmla v22.4s, v2.4s, v8.2s[1] - fmla v23.4s, v3.4s, v8.2s[1] - - fmla v24.4s, v0.4s, v9.2s[0] - fmla v25.4s, v1.4s, v9.2s[0] - fmla v26.4s, v2.4s, v9.2s[0] - fmla v27.4s, v3.4s, v9.2s[0] - - fmla v28.4s, v0.4s, v9.2s[1] - fmla v29.4s, v1.4s, v9.2s[1] - fmla v30.4s, v2.4s, v9.2s[1] - fmla v31.4s, v3.4s, v9.2s[1] + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] + fmla v18.4s, v2.4s, v8.s[0] + fmla v19.4s, v3.4s, v8.s[0] + + fmla v20.4s, v0.4s, v8.s[1] + fmla v21.4s, v1.4s, v8.s[1] + fmla v22.4s, v2.4s, v8.s[1] + fmla v23.4s, v3.4s, v8.s[1] + + fmla v24.4s, v0.4s, v9.s[0] + fmla v25.4s, v1.4s, v9.s[0] + fmla v26.4s, v2.4s, v9.s[0] + fmla v27.4s, v3.4s, v9.s[0] + + fmla v28.4s, v0.4s, v9.s[1] + fmla v29.4s, v1.4s, v9.s[1] + fmla v30.4s, v2.4s, v9.s[1] + fmla v31.4s, v3.4s, v9.s[1] ld1 {v12.2s, v13.2s}, [pB] add pB, pB, #16 @@ -224,25 +224,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL16x4_M2 - fmla v16.4s, v4.4s, v12.2s[0] - fmla v17.4s, v5.4s, v12.2s[0] - fmla v18.4s, v6.4s, v12.2s[0] - fmla v19.4s, v7.4s, v12.2s[0] - - fmla v20.4s, v4.4s, v12.2s[1] - fmla v21.4s, v5.4s, v12.2s[1] - fmla v22.4s, v6.4s, v12.2s[1] - fmla v23.4s, v7.4s, v12.2s[1] - - fmla v24.4s, v4.4s, v13.2s[0] - fmla v25.4s, v5.4s, v13.2s[0] - fmla v26.4s, v6.4s, v13.2s[0] - fmla v27.4s, v7.4s, v13.2s[0] - - fmla v28.4s, v4.4s, v13.2s[1] - fmla v29.4s, v5.4s, v13.2s[1] - fmla v30.4s, v6.4s, v13.2s[1] - fmla v31.4s, v7.4s, v13.2s[1] + fmla v16.4s, v4.4s, v12.s[0] + fmla v17.4s, v5.4s, v12.s[0] + fmla v18.4s, v6.4s, v12.s[0] + fmla v19.4s, v7.4s, v12.s[0] + + fmla v20.4s, v4.4s, v12.s[1] + fmla v21.4s, v5.4s, v12.s[1] + fmla v22.4s, v6.4s, v12.s[1] + fmla v23.4s, v7.4s, v12.s[1] + + fmla v24.4s, v4.4s, v13.s[0] + fmla v25.4s, v5.4s, v13.s[0] + fmla v26.4s, v6.4s, v13.s[0] + fmla v27.4s, v7.4s, v13.s[0] + + fmla v28.4s, v4.4s, v13.s[1] + fmla v29.4s, v5.4s, v13.s[1] + fmla v30.4s, v6.4s, v13.s[1] + fmla v31.4s, v7.4s, v13.s[1] ld1 {v8.2s, v9.2s}, [pB] add pB, pB, #16 @@ -257,25 +257,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL16x4_E - fmla v16.4s, v4.4s, v12.2s[0] - fmla v17.4s, v5.4s, v12.2s[0] - fmla v18.4s, v6.4s, v12.2s[0] - fmla v19.4s, v7.4s, v12.2s[0] - - fmla v20.4s, v4.4s, v12.2s[1] - fmla v21.4s, v5.4s, v12.2s[1] - fmla v22.4s, v6.4s, v12.2s[1] - fmla v23.4s, v7.4s, v12.2s[1] - - fmla v24.4s, v4.4s, v13.2s[0] - fmla v25.4s, v5.4s, v13.2s[0] - fmla v26.4s, v6.4s, v13.2s[0] - fmla v27.4s, v7.4s, v13.2s[0] - - fmla v28.4s, v4.4s, v13.2s[1] - fmla v29.4s, v5.4s, v13.2s[1] - fmla v30.4s, v6.4s, v13.2s[1] - fmla v31.4s, v7.4s, v13.2s[1] + fmla v16.4s, v4.4s, v12.s[0] + fmla v17.4s, v5.4s, v12.s[0] + fmla v18.4s, v6.4s, v12.s[0] + fmla v19.4s, v7.4s, v12.s[0] + + fmla v20.4s, v4.4s, v12.s[1] + fmla v21.4s, v5.4s, v12.s[1] + fmla v22.4s, v6.4s, v12.s[1] + fmla v23.4s, v7.4s, v12.s[1] + + fmla v24.4s, v4.4s, v13.s[0] + fmla v25.4s, v5.4s, v13.s[0] + fmla v26.4s, v6.4s, v13.s[0] + fmla v27.4s, v7.4s, v13.s[0] + + fmla v28.4s, v4.4s, v13.s[1] + fmla v29.4s, v5.4s, v13.s[1] + fmla v30.4s, v6.4s, v13.s[1] + fmla v31.4s, v7.4s, v13.s[1] .endm .macro KERNEL16x4_SUB @@ -290,25 +290,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld1 {v3.4s}, [pA] add pA, pA, #16 - fmla v16.4s, v0.4s, v8.2s[0] - fmla v17.4s, v1.4s, v8.2s[0] - fmla v18.4s, v2.4s, v8.2s[0] - fmla v19.4s, v3.4s, v8.2s[0] - - fmla v20.4s, v0.4s, v8.2s[1] - fmla v21.4s, v1.4s, v8.2s[1] - fmla v22.4s, v2.4s, v8.2s[1] - fmla v23.4s, v3.4s, v8.2s[1] - - fmla v24.4s, v0.4s, v9.2s[0] - fmla v25.4s, v1.4s, v9.2s[0] - fmla v26.4s, v2.4s, v9.2s[0] - fmla v27.4s, v3.4s, v9.2s[0] - - fmla v28.4s, v0.4s, v9.2s[1] - fmla v29.4s, v1.4s, v9.2s[1] - fmla v30.4s, v2.4s, v9.2s[1] - fmla v31.4s, v3.4s, v9.2s[1] + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] + fmla v18.4s, v2.4s, v8.s[0] + fmla v19.4s, v3.4s, v8.s[0] + + fmla v20.4s, v0.4s, v8.s[1] + fmla v21.4s, v1.4s, v8.s[1] + fmla v22.4s, v2.4s, v8.s[1] + fmla v23.4s, v3.4s, v8.s[1] + + fmla v24.4s, v0.4s, v9.s[0] + fmla v25.4s, v1.4s, v9.s[0] + fmla v26.4s, v2.4s, v9.s[0] + fmla v27.4s, v3.4s, v9.s[0] + + fmla v28.4s, v0.4s, v9.s[1] + fmla v29.4s, v1.4s, v9.s[1] + fmla v30.4s, v2.4s, v9.s[1] + fmla v31.4s, v3.4s, v9.s[1] .endm .macro SAVE16x4 @@ -370,14 +370,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v1.4s}, [pA] add pA, pA, #16 - fmul v16.4s, v0.4s, v8.2s[0] - fmul v17.4s, v1.4s, v8.2s[0] - fmul v20.4s, v0.4s, v8.2s[1] - fmul v21.4s, v1.4s, v8.2s[1] - fmul v24.4s, v0.4s, v9.2s[0] - fmul v25.4s, v1.4s, v9.2s[0] - fmul v28.4s, v0.4s, v9.2s[1] - fmul v29.4s, v1.4s, v9.2s[1] + fmul v16.4s, v0.4s, v8.s[0] + fmul v17.4s, v1.4s, v8.s[0] + fmul v20.4s, v0.4s, v8.s[1] + fmul v21.4s, v1.4s, v8.s[1] + fmul v24.4s, v0.4s, v9.s[0] + fmul v25.4s, v1.4s, v9.s[0] + fmul v28.4s, v0.4s, v9.s[1] + fmul v29.4s, v1.4s, v9.s[1] ld1 {v12.2s, v13.2s}, [pB] add pB, pB, #16 @@ -388,14 +388,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL8x4_M1 - fmla v16.4s, v0.4s, v8.2s[0] - fmla v17.4s, v1.4s, v8.2s[0] - fmla v20.4s, v0.4s, v8.2s[1] - fmla v21.4s, v1.4s, v8.2s[1] - fmla v24.4s, v0.4s, v9.2s[0] - fmla v25.4s, v1.4s, v9.2s[0] - fmla v28.4s, v0.4s, v9.2s[1] - fmla v29.4s, v1.4s, v9.2s[1] + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] + fmla v20.4s, v0.4s, v8.s[1] + fmla v21.4s, v1.4s, v8.s[1] + fmla v24.4s, v0.4s, v9.s[0] + fmla v25.4s, v1.4s, v9.s[0] + fmla v28.4s, v0.4s, v9.s[1] + fmla v29.4s, v1.4s, v9.s[1] ld1 {v12.2s, v13.2s}, [pB] add pB, pB, #16 @@ -406,14 +406,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL8x4_M2 - fmla v16.4s, v4.4s, v12.2s[0] - fmla v17.4s, v5.4s, v12.2s[0] - fmla v20.4s, v4.4s, v12.2s[1] - fmla v21.4s, v5.4s, v12.2s[1] - fmla v24.4s, v4.4s, v13.2s[0] - fmla v25.4s, v5.4s, v13.2s[0] - fmla v28.4s, v4.4s, v13.2s[1] - fmla v29.4s, v5.4s, v13.2s[1] + fmla v16.4s, v4.4s, v12.s[0] + fmla v17.4s, v5.4s, v12.s[0] + fmla v20.4s, v4.4s, v12.s[1] + fmla v21.4s, v5.4s, v12.s[1] + fmla v24.4s, v4.4s, v13.s[0] + fmla v25.4s, v5.4s, v13.s[0] + fmla v28.4s, v4.4s, v13.s[1] + fmla v29.4s, v5.4s, v13.s[1] ld1 {v8.2s, v9.2s}, [pB] add pB, pB, #16 @@ -424,14 +424,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL8x4_E - fmla v16.4s, v4.4s, v12.2s[0] - fmla v17.4s, v5.4s, v12.2s[0] - fmla v20.4s, v4.4s, v12.2s[1] - fmla v21.4s, v5.4s, v12.2s[1] - fmla v24.4s, v4.4s, v13.2s[0] - fmla v25.4s, v5.4s, v13.2s[0] - fmla v28.4s, v4.4s, v13.2s[1] - fmla v29.4s, v5.4s, v13.2s[1] + fmla v16.4s, v4.4s, v12.s[0] + fmla v17.4s, v5.4s, v12.s[0] + fmla v20.4s, v4.4s, v12.s[1] + fmla v21.4s, v5.4s, v12.s[1] + fmla v24.4s, v4.4s, v13.s[0] + fmla v25.4s, v5.4s, v13.s[0] + fmla v28.4s, v4.4s, v13.s[1] + fmla v29.4s, v5.4s, v13.s[1] .endm .macro KERNEL8x4_SUB @@ -442,14 +442,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld1 {v1.4s}, [pA] add pA, pA, #16 - fmla v16.4s, v0.4s, v8.2s[0] - fmla v17.4s, v1.4s, v8.2s[0] - fmla v20.4s, v0.4s, v8.2s[1] - fmla v21.4s, v1.4s, v8.2s[1] - fmla v24.4s, v0.4s, v9.2s[0] - fmla v25.4s, v1.4s, v9.2s[0] - fmla v28.4s, v0.4s, v9.2s[1] - fmla v29.4s, v1.4s, v9.2s[1] + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] + fmla v20.4s, v0.4s, v8.s[1] + fmla v21.4s, v1.4s, v8.s[1] + fmla v24.4s, v0.4s, v9.s[0] + fmla v25.4s, v1.4s, v9.s[0] + fmla v28.4s, v0.4s, v9.s[1] + fmla v29.4s, v1.4s, v9.s[1] .endm .macro SAVE8x4 @@ -501,17 +501,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s, v1.2s}, [pA] add pA, pA, #16 - fmul v16.2s, v0.2s, v8.2s[0] - fmul v29.2s, v1.2s, v9.2s[1] + fmul v16.2s, v0.2s, v8.s[0] + fmul v29.2s, v1.2s, v9.s[1] - fmul v20.2s, v0.2s, v8.2s[1] - fmul v25.2s, v1.2s, v9.2s[0] + fmul v20.2s, v0.2s, v8.s[1] + fmul v25.2s, v1.2s, v9.s[0] - fmul v24.2s, v0.2s, v9.2s[0] - fmul v21.2s, v1.2s, v8.2s[1] + fmul v24.2s, v0.2s, v9.s[0] + fmul v21.2s, v1.2s, v8.s[1] - fmul v28.2s, v0.2s, v9.2s[1] - fmul v17.2s, v1.2s, v8.2s[0] + fmul v28.2s, v0.2s, v9.s[1] + fmul v17.2s, v1.2s, v8.s[0] ld1 {v12.2s, v13.2s}, [pB] add pB, pB, #16 @@ -520,61 +520,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL4x4_M1 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v29.2s, v1.2s, v9.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v29.2s, v1.2s, v9.s[1] ld1 {v12.2s, v13.2s}, [pB] // For next round add pB, pB, #16 - fmla v20.2s, v0.2s, v8.2s[1] - fmla v25.2s, v1.2s, v9.2s[0] + fmla v20.2s, v0.2s, v8.s[1] + fmla v25.2s, v1.2s, v9.s[0] ld1 {v4.2s, v5.2s}, [pA] // For next round add pA, pA, #16 - fmla v24.2s, v0.2s, v9.2s[0] - fmla v21.2s, v1.2s, v8.2s[1] + fmla v24.2s, v0.2s, v9.s[0] + fmla v21.2s, v1.2s, v8.s[1] prfm PLDL1KEEP, [pB, #512] - fmla v28.2s, v0.2s, v9.2s[1] - fmla v17.2s, v1.2s, v8.2s[0] + fmla v28.2s, v0.2s, v9.s[1] + fmla v17.2s, v1.2s, v8.s[0] .endm .macro KERNEL4x4_M2 - fmla v16.2s, v4.2s, v12.2s[0] - fmla v29.2s, v5.2s, v13.2s[1] + fmla v16.2s, v4.2s, v12.s[0] + fmla v29.2s, v5.2s, v13.s[1] ld1 {v8.2s, v9.2s}, [pB] // For next round add pB, pB, #16 - fmla v20.2s, v4.2s, v12.2s[1] - fmla v25.2s, v5.2s, v13.2s[0] + fmla v20.2s, v4.2s, v12.s[1] + fmla v25.2s, v5.2s, v13.s[0] ld1 {v0.2s, v1.2s}, [pA] // For next round add pA, pA, #16 - fmla v24.2s, v4.2s, v13.2s[0] - fmla v21.2s, v5.2s, v12.2s[1] + fmla v24.2s, v4.2s, v13.s[0] + fmla v21.2s, v5.2s, v12.s[1] prfm PLDL1KEEP, [pA, #512] - fmla v28.2s, v4.2s, v13.2s[1] - fmla v17.2s, v5.2s, v12.2s[0] + fmla v28.2s, v4.2s, v13.s[1] + fmla v17.2s, v5.2s, v12.s[0] .endm .macro KERNEL4x4_E - fmla v16.2s, v4.2s, v12.2s[0] - fmla v29.2s, v5.2s, v13.2s[1] + fmla v16.2s, v4.2s, v12.s[0] + fmla v29.2s, v5.2s, v13.s[1] - fmla v20.2s, v4.2s, v12.2s[1] - fmla v25.2s, v5.2s, v13.2s[0] + fmla v20.2s, v4.2s, v12.s[1] + fmla v25.2s, v5.2s, v13.s[0] - fmla v24.2s, v4.2s, v13.2s[0] - fmla v21.2s, v5.2s, v12.2s[1] + fmla v24.2s, v4.2s, v13.s[0] + fmla v21.2s, v5.2s, v12.s[1] - fmla v28.2s, v4.2s, v13.2s[1] - fmla v17.2s, v5.2s, v12.2s[0] + fmla v28.2s, v4.2s, v13.s[1] + fmla v17.2s, v5.2s, v12.s[0] .endm .macro KERNEL4x4_SUB @@ -583,17 +583,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld1 {v0.2s, v1.2s}, [pA] add pA, pA, #16 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v29.2s, v1.2s, v9.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v29.2s, v1.2s, v9.s[1] - fmla v20.2s, v0.2s, v8.2s[1] - fmla v25.2s, v1.2s, v9.2s[0] + fmla v20.2s, v0.2s, v8.s[1] + fmla v25.2s, v1.2s, v9.s[0] - fmla v24.2s, v0.2s, v9.2s[0] - fmla v21.2s, v1.2s, v8.2s[1] + fmla v24.2s, v0.2s, v9.s[0] + fmla v21.2s, v1.2s, v8.s[1] - fmla v28.2s, v0.2s, v9.2s[1] - fmla v17.2s, v1.2s, v8.2s[0] + fmla v28.2s, v0.2s, v9.s[1] + fmla v17.2s, v1.2s, v8.s[0] .endm .macro SAVE4x4 @@ -638,10 +638,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s}, [pA] add pA, pA, #8 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v20.2s, v0.2s, v8.2s[1] - fmla v24.2s, v0.2s, v9.2s[0] - fmla v28.2s, v0.2s, v9.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v20.2s, v0.2s, v8.s[1] + fmla v24.2s, v0.2s, v9.s[0] + fmla v28.2s, v0.2s, v9.s[1] .endm .macro SAVE2x4 @@ -729,15 +729,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v3.4s}, [pA] add pA, pA, #16 - fmla v16.4s, v0.4s, v8.2s[0] - fmla v17.4s, v1.4s, v8.2s[0] - fmla v18.4s, v2.4s, v8.2s[0] - fmla v19.4s, v3.4s, v8.2s[0] + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] + fmla v18.4s, v2.4s, v8.s[0] + fmla v19.4s, v3.4s, v8.s[0] - fmla v20.4s, v0.4s, v8.2s[1] - fmla v21.4s, v1.4s, v8.2s[1] - fmla v22.4s, v2.4s, v8.2s[1] - fmla v23.4s, v3.4s, v8.2s[1] + fmla v20.4s, v0.4s, v8.s[1] + fmla v21.4s, v1.4s, v8.s[1] + fmla v22.4s, v2.4s, v8.s[1] + fmla v23.4s, v3.4s, v8.s[1] .endm .macro SAVE16x2 @@ -777,11 +777,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld1 {v1.4s}, [pA] add pA, pA, #16 - fmla v16.4s, v0.4s, v8.2s[0] - fmla v17.4s, v1.4s, v8.2s[0] + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] - fmla v20.4s, v0.4s, v8.2s[1] - fmla v21.4s, v1.4s, v8.2s[1] + fmla v20.4s, v0.4s, v8.s[1] + fmla v21.4s, v1.4s, v8.s[1] .endm .macro SAVE8x2 @@ -817,10 +817,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s, v1.2s}, [pA] add pA, pA, #16 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v17.2s, v1.2s, v8.2s[0] - fmla v20.2s, v0.2s, v8.2s[1] - fmla v21.2s, v1.2s, v8.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v17.2s, v1.2s, v8.s[0] + fmla v20.2s, v0.2s, v8.s[1] + fmla v21.2s, v1.2s, v8.s[1] .endm .macro SAVE4x2 @@ -852,8 +852,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s}, [pA] add pA, pA, #8 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v20.2s, v0.2s, v8.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v20.2s, v0.2s, v8.s[1] .endm .macro SAVE2x2 @@ -882,7 +882,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ldr s0 , [pA] add pA, pA, #4 - fmla v16.2s, v8.2s, v0.2s[0] + fmla v16.2s, v8.2s, v0.s[0] .endm .macro SAVE1x2 @@ -918,10 +918,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v3.4s}, [pA] add pA, pA, #16 - fmla v16.4s, v0.4s, v8.2s[0] - fmla v17.4s, v1.4s, v8.2s[0] - fmla v18.4s, v2.4s, v8.2s[0] - fmla v19.4s, v3.4s, v8.2s[0] + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] + fmla v18.4s, v2.4s, v8.s[0] + fmla v19.4s, v3.4s, v8.s[0] .endm .macro SAVE16x1 @@ -951,8 +951,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v1.4s}, [pA] add pA, pA, #16 - fmla v16.4s, v0.4s, v8.2s[0] - fmla v17.4s, v1.4s, v8.2s[0] + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] .endm .macro SAVE8x1 @@ -978,8 +978,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld1 {v0.2s, v1.2s}, [pA] add pA , pA, #16 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v17.2s, v1.2s, v8.2s[0] + fmla v16.2s, v0.2s, v8.s[0] + fmla v17.2s, v1.2s, v8.s[0] .endm .macro SAVE4x1 @@ -1004,7 +1004,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s}, [pA] add pA , pA, #8 - fmla v16.2s, v0.2s, v8.2s[0] + fmla v16.2s, v0.2s, v8.s[0] .endm .macro SAVE2x1 diff --git a/kernel/arm64/sgemm_kernel_4x4.S b/kernel/arm64/sgemm_kernel_4x4.S index bfa80d589..a5cf7baff 100644 --- a/kernel/arm64/sgemm_kernel_4x4.S +++ b/kernel/arm64/sgemm_kernel_4x4.S @@ -192,164 +192,164 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.4s}, [pA_0] add pA_0, pA_0, #16 - fmul v16.4s, v0.4s, v8.4s[0] - fmul v20.4s, v0.4s, v8.4s[1] + fmul v16.4s, v0.4s, v8.s[0] + fmul v20.4s, v0.4s, v8.s[1] ld1 {v2.4s}, [pA_1] add pA_1, pA_1, #16 - fmul v24.4s, v0.4s, v8.4s[2] - fmul v28.4s, v0.4s, v8.4s[3] + fmul v24.4s, v0.4s, v8.s[2] + fmul v28.4s, v0.4s, v8.s[3] ld1 {v4.4s}, [pA_2] add pA_2, pA_2, #16 - fmul v17.4s, v2.4s, v8.4s[0] - fmul v21.4s, v2.4s, v8.4s[1] + fmul v17.4s, v2.4s, v8.s[0] + fmul v21.4s, v2.4s, v8.s[1] ld1 {v6.4s}, [pA_3] add pA_3, pA_3, #16 - fmul v25.4s, v2.4s, v8.4s[2] - fmul v29.4s, v2.4s, v8.4s[3] + fmul v25.4s, v2.4s, v8.s[2] + fmul v29.4s, v2.4s, v8.s[3] ld1 {v12.4s}, [pB] // for next round add pB, pB, #16 - fmul v18.4s, v4.4s, v8.4s[0] - fmul v19.4s, v6.4s, v8.4s[0] + fmul v18.4s, v4.4s, v8.s[0] + fmul v19.4s, v6.4s, v8.s[0] ld1 {v1.4s}, [pA_0] // for next round add pA_0, pA_0, #16 - fmul v22.4s, v4.4s, v8.4s[1] - fmul v23.4s, v6.4s, v8.4s[1] + fmul v22.4s, v4.4s, v8.s[1] + fmul v23.4s, v6.4s, v8.s[1] ld1 {v3.4s}, [pA_1] // for next round add pA_1, pA_1, #16 - fmul v26.4s, v4.4s, v8.4s[2] - fmul v27.4s, v6.4s, v8.4s[2] + fmul v26.4s, v4.4s, v8.s[2] + fmul v27.4s, v6.4s, v8.s[2] ld1 {v5.4s}, [pA_2] // for next round add pA_2, pA_2, #16 - fmul v30.4s, v4.4s, v8.4s[3] - fmul v31.4s, v6.4s, v8.4s[3] + fmul v30.4s, 
v4.4s, v8.s[3] + fmul v31.4s, v6.4s, v8.s[3] ld1 {v7.4s}, [pA_3] // for next round add pA_3, pA_3, #16 .endm .macro KERNEL16x4_M2 - fmla v16.4s, v1.4s, v12.4s[0] - fmla v17.4s, v3.4s, v12.4s[0] + fmla v16.4s, v1.4s, v12.s[0] + fmla v17.4s, v3.4s, v12.s[0] ld1 {v8.4s}, [pB] // for next round add pB, pB, #16 - fmla v18.4s, v5.4s, v12.4s[0] - fmla v19.4s, v7.4s, v12.4s[0] + fmla v18.4s, v5.4s, v12.s[0] + fmla v19.4s, v7.4s, v12.s[0] ld1 {v0.4s}, [pA_0] // for next round add pA_0, pA_0, #16 - fmla v20.4s, v1.4s, v12.4s[1] - fmla v21.4s, v3.4s, v12.4s[1] + fmla v20.4s, v1.4s, v12.s[1] + fmla v21.4s, v3.4s, v12.s[1] ld1 {v2.4s}, [pA_1] // for next round add pA_1, pA_1, #16 - fmla v22.4s, v5.4s, v12.4s[1] - fmla v23.4s, v7.4s, v12.4s[1] + fmla v22.4s, v5.4s, v12.s[1] + fmla v23.4s, v7.4s, v12.s[1] ld1 {v4.4s}, [pA_2] // for next round add pA_2, pA_2, #16 - fmla v24.4s, v1.4s, v12.4s[2] - fmla v25.4s, v3.4s, v12.4s[2] + fmla v24.4s, v1.4s, v12.s[2] + fmla v25.4s, v3.4s, v12.s[2] ld1 {v6.4s}, [pA_3] // for next round add pA_3, pA_3, #16 - fmla v26.4s, v5.4s, v12.4s[2] - fmla v27.4s, v7.4s, v12.4s[2] + fmla v26.4s, v5.4s, v12.s[2] + fmla v27.4s, v7.4s, v12.s[2] prfm PLDL1KEEP, [pA_2, #512] - fmla v28.4s, v1.4s, v12.4s[3] - fmla v29.4s, v3.4s, v12.4s[3] + fmla v28.4s, v1.4s, v12.s[3] + fmla v29.4s, v3.4s, v12.s[3] prfm PLDL1KEEP, [pA_3, #512] - fmla v30.4s, v5.4s, v12.4s[3] - fmla v31.4s, v7.4s, v12.4s[3] + fmla v30.4s, v5.4s, v12.s[3] + fmla v31.4s, v7.4s, v12.s[3] prfm PLDL1KEEP, [pB, #512] .endm .macro KERNEL16x4_M1 - fmla v16.4s, v0.4s, v8.4s[0] - fmla v17.4s, v2.4s, v8.4s[0] + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v2.4s, v8.s[0] ld1 {v12.4s}, [pB] // for next round add pB, pB, #16 - fmla v18.4s, v4.4s, v8.4s[0] - fmla v19.4s, v6.4s, v8.4s[0] + fmla v18.4s, v4.4s, v8.s[0] + fmla v19.4s, v6.4s, v8.s[0] ld1 {v1.4s}, [pA_0] // for next round add pA_0, pA_0, #16 - fmla v20.4s, v0.4s, v8.4s[1] - fmla v21.4s, v2.4s, v8.4s[1] + fmla v20.4s, v0.4s, v8.s[1] + fmla v21.4s, 
v2.4s, v8.s[1] ld1 {v3.4s}, [pA_1] // for next round add pA_1, pA_1, #16 - fmla v22.4s, v4.4s, v8.4s[1] - fmla v23.4s, v6.4s, v8.4s[1] + fmla v22.4s, v4.4s, v8.s[1] + fmla v23.4s, v6.4s, v8.s[1] ld1 {v5.4s}, [pA_2] // for next round add pA_2, pA_2, #16 - fmla v24.4s, v0.4s, v8.4s[2] - fmla v25.4s, v2.4s, v8.4s[2] + fmla v24.4s, v0.4s, v8.s[2] + fmla v25.4s, v2.4s, v8.s[2] ld1 {v7.4s}, [pA_3] // for next round add pA_3, pA_3, #16 - fmla v26.4s, v4.4s, v8.4s[2] - fmla v27.4s, v6.4s, v8.4s[2] + fmla v26.4s, v4.4s, v8.s[2] + fmla v27.4s, v6.4s, v8.s[2] prfm PLDL1KEEP, [pA_0, #512] - fmla v28.4s, v0.4s, v8.4s[3] - fmla v29.4s, v2.4s, v8.4s[3] + fmla v28.4s, v0.4s, v8.s[3] + fmla v29.4s, v2.4s, v8.s[3] prfm PLDL1KEEP, [pA_1, #512] - fmla v30.4s, v4.4s, v8.4s[3] - fmla v31.4s, v6.4s, v8.4s[3] + fmla v30.4s, v4.4s, v8.s[3] + fmla v31.4s, v6.4s, v8.s[3] .endm .macro KERNEL16x4_E - fmla v16.4s, v1.4s, v12.4s[0] - fmla v17.4s, v3.4s, v12.4s[0] - fmla v18.4s, v5.4s, v12.4s[0] - fmla v19.4s, v7.4s, v12.4s[0] - fmla v20.4s, v1.4s, v12.4s[1] - fmla v21.4s, v3.4s, v12.4s[1] - fmla v22.4s, v5.4s, v12.4s[1] - fmla v23.4s, v7.4s, v12.4s[1] - fmla v24.4s, v1.4s, v12.4s[2] - fmla v25.4s, v3.4s, v12.4s[2] - fmla v26.4s, v5.4s, v12.4s[2] - fmla v27.4s, v7.4s, v12.4s[2] - fmla v28.4s, v1.4s, v12.4s[3] - fmla v29.4s, v3.4s, v12.4s[3] - fmla v30.4s, v5.4s, v12.4s[3] - fmla v31.4s, v7.4s, v12.4s[3] + fmla v16.4s, v1.4s, v12.s[0] + fmla v17.4s, v3.4s, v12.s[0] + fmla v18.4s, v5.4s, v12.s[0] + fmla v19.4s, v7.4s, v12.s[0] + fmla v20.4s, v1.4s, v12.s[1] + fmla v21.4s, v3.4s, v12.s[1] + fmla v22.4s, v5.4s, v12.s[1] + fmla v23.4s, v7.4s, v12.s[1] + fmla v24.4s, v1.4s, v12.s[2] + fmla v25.4s, v3.4s, v12.s[2] + fmla v26.4s, v5.4s, v12.s[2] + fmla v27.4s, v7.4s, v12.s[2] + fmla v28.4s, v1.4s, v12.s[3] + fmla v29.4s, v3.4s, v12.s[3] + fmla v30.4s, v5.4s, v12.s[3] + fmla v31.4s, v7.4s, v12.s[3] .endm .macro KERNEL16x4_SUB @@ -359,34 +359,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY 
OF SUCH DAMAGE. ld1 {v0.4s}, [pA_0] add pA_0, pA_0, #16 - fmla v16.4s, v0.4s, v8.4s[0] - fmla v20.4s, v0.4s, v8.4s[1] - fmla v24.4s, v0.4s, v8.4s[2] - fmla v28.4s, v0.4s, v8.4s[3] + fmla v16.4s, v0.4s, v8.s[0] + fmla v20.4s, v0.4s, v8.s[1] + fmla v24.4s, v0.4s, v8.s[2] + fmla v28.4s, v0.4s, v8.s[3] ld1 {v2.4s}, [pA_1] add pA_1, pA_1, #16 - fmla v17.4s, v2.4s, v8.4s[0] - fmla v21.4s, v2.4s, v8.4s[1] - fmla v25.4s, v2.4s, v8.4s[2] - fmla v29.4s, v2.4s, v8.4s[3] + fmla v17.4s, v2.4s, v8.s[0] + fmla v21.4s, v2.4s, v8.s[1] + fmla v25.4s, v2.4s, v8.s[2] + fmla v29.4s, v2.4s, v8.s[3] ld1 {v4.4s}, [pA_2] add pA_2, pA_2, #16 - fmla v18.4s, v4.4s, v8.4s[0] - fmla v22.4s, v4.4s, v8.4s[1] - fmla v26.4s, v4.4s, v8.4s[2] - fmla v30.4s, v4.4s, v8.4s[3] + fmla v18.4s, v4.4s, v8.s[0] + fmla v22.4s, v4.4s, v8.s[1] + fmla v26.4s, v4.4s, v8.s[2] + fmla v30.4s, v4.4s, v8.s[3] ld1 {v6.4s}, [pA_3] add pA_3, pA_3, #16 - fmla v19.4s, v6.4s, v8.4s[0] - fmla v23.4s, v6.4s, v8.4s[1] - fmla v27.4s, v6.4s, v8.4s[2] - fmla v31.4s, v6.4s, v8.4s[3] + fmla v19.4s, v6.4s, v8.s[0] + fmla v23.4s, v6.4s, v8.s[1] + fmla v27.4s, v6.4s, v8.s[2] + fmla v31.4s, v6.4s, v8.s[3] .endm .macro SAVE16x4 @@ -456,28 +456,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld1 {v0.2s, v1.2s}, [pA_0] add pA_0, pA_0, #16 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v29.2s, v1.2s, v9.2s[1] - fmla v20.2s, v0.2s, v8.2s[1] - fmla v25.2s, v1.2s, v9.2s[0] + fmla v16.2s, v0.2s, v8.s[0] + fmla v29.2s, v1.2s, v9.s[1] + fmla v20.2s, v0.2s, v8.s[1] + fmla v25.2s, v1.2s, v9.s[0] ld1 {v2.2s, v3.2s}, [pA_1] add pA_1, pA_1, #16 - fmla v24.2s, v0.2s, v9.2s[0] - fmla v21.2s, v1.2s, v8.2s[1] - fmla v28.2s, v0.2s, v9.2s[1] - fmla v17.2s, v1.2s, v8.2s[0] + fmla v24.2s, v0.2s, v9.s[0] + fmla v21.2s, v1.2s, v8.s[1] + fmla v28.2s, v0.2s, v9.s[1] + fmla v17.2s, v1.2s, v8.s[0] - fmla v18.2s, v2.2s, v8.2s[0] - fmla v31.2s, v3.2s, v9.2s[1] - fmla v22.2s, v2.2s, v8.2s[1] - fmla v27.2s, v3.2s, v9.2s[0] + fmla v18.2s, v2.2s, v8.s[0] + fmla v31.2s, v3.2s, v9.s[1] + fmla v22.2s, v2.2s, v8.s[1] + fmla v27.2s, v3.2s, v9.s[0] - fmla v26.2s, v2.2s, v9.2s[0] - fmla v23.2s, v3.2s, v8.2s[1] - fmla v30.2s, v2.2s, v9.2s[1] - fmla v19.2s, v3.2s, v8.2s[0] + fmla v26.2s, v2.2s, v9.s[0] + fmla v23.2s, v3.2s, v8.s[1] + fmla v30.2s, v2.2s, v9.s[1] + fmla v19.2s, v3.2s, v8.s[0] .endm .macro SAVE8x4 @@ -556,17 +556,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s, v1.2s}, [pA_0] add pA_0, pA_0, #16 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v29.2s, v1.2s, v9.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v29.2s, v1.2s, v9.s[1] - fmla v20.2s, v0.2s, v8.2s[1] - fmla v25.2s, v1.2s, v9.2s[0] + fmla v20.2s, v0.2s, v8.s[1] + fmla v25.2s, v1.2s, v9.s[0] - fmla v24.2s, v0.2s, v9.2s[0] - fmla v21.2s, v1.2s, v8.2s[1] + fmla v24.2s, v0.2s, v9.s[0] + fmla v21.2s, v1.2s, v8.s[1] - fmla v28.2s, v0.2s, v9.2s[1] - fmla v17.2s, v1.2s, v8.2s[0] + fmla v28.2s, v0.2s, v9.s[1] + fmla v17.2s, v1.2s, v8.s[0] .endm .macro SAVE4x4 @@ -614,10 +614,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld1 {v0.2s}, [pA_0] add pA_0, pA_0, #8 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v20.2s, v0.2s, v8.2s[1] - fmla v24.2s, v0.2s, v9.2s[0] - fmla v28.2s, v0.2s, v9.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v20.2s, v0.2s, v8.s[1] + fmla v24.2s, v0.2s, v9.s[0] + fmla v28.2s, v0.2s, v9.s[1] .endm .macro SAVE2x4 @@ -700,10 +700,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s, v1.2s}, [pA_0] add pA_0, pA_0, #16 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v17.2s, v1.2s, v8.2s[0] - fmla v20.2s, v0.2s, v8.2s[1] - fmla v21.2s, v1.2s, v8.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v17.2s, v1.2s, v8.s[0] + fmla v20.2s, v0.2s, v8.s[1] + fmla v21.2s, v1.2s, v8.s[1] .endm .macro SAVE4x2 @@ -736,8 +736,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s}, [pA_0] add pA_0, pA_0, #8 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v20.2s, v0.2s, v8.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v20.2s, v0.2s, v8.s[1] .endm .macro SAVE2x2 @@ -767,7 +767,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ldr s0 , [pA_0] add pA_0, pA_0, #4 - fmla v16.2s, v8.2s, v0.2s[0] + fmla v16.2s, v8.2s, v0.s[0] .endm .macro SAVE1x2 @@ -796,8 +796,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s, v1.2s}, [pA_0] add pA_0 , pA_0, #16 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v17.2s, v1.2s, v8.2s[0] + fmla v16.2s, v0.2s, v8.s[0] + fmla v17.2s, v1.2s, v8.s[0] .endm .macro SAVE4x1 @@ -825,7 +825,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s}, [pA_0] add pA_0 , pA_0, #8 - fmla v16.2s, v0.2s, v8.2s[0] + fmla v16.2s, v0.2s, v8.s[0] .endm .macro SAVE2x1 diff --git a/kernel/arm64/sgemm_kernel_8x8.S b/kernel/arm64/sgemm_kernel_8x8.S index ac690e4d4..bd47bed31 100644 --- a/kernel/arm64/sgemm_kernel_8x8.S +++ b/kernel/arm64/sgemm_kernel_8x8.S @@ -157,22 +157,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld1 {v1.4s}, [pA] add pA, pA, #16 - fmul v16.4s, v0.4s, v4.4s[0] - fmul v17.4s, v1.4s, v4.4s[0] - fmul v18.4s, v0.4s, v4.4s[1] - fmul v19.4s, v1.4s, v4.4s[1] - fmul v20.4s, v0.4s, v4.4s[2] - fmul v21.4s, v1.4s, v4.4s[2] - fmul v22.4s, v0.4s, v4.4s[3] - fmul v23.4s, v1.4s, v4.4s[3] - fmul v24.4s, v0.4s, v5.4s[0] - fmul v25.4s, v1.4s, v5.4s[0] - fmul v26.4s, v0.4s, v5.4s[1] - fmul v27.4s, v1.4s, v5.4s[1] - fmul v28.4s, v0.4s, v5.4s[2] - fmul v29.4s, v1.4s, v5.4s[2] - fmul v30.4s, v0.4s, v5.4s[3] - fmul v31.4s, v1.4s, v5.4s[3] + fmul v16.4s, v0.4s, v4.s[0] + fmul v17.4s, v1.4s, v4.s[0] + fmul v18.4s, v0.4s, v4.s[1] + fmul v19.4s, v1.4s, v4.s[1] + fmul v20.4s, v0.4s, v4.s[2] + fmul v21.4s, v1.4s, v4.s[2] + fmul v22.4s, v0.4s, v4.s[3] + fmul v23.4s, v1.4s, v4.s[3] + fmul v24.4s, v0.4s, v5.s[0] + fmul v25.4s, v1.4s, v5.s[0] + fmul v26.4s, v0.4s, v5.s[1] + fmul v27.4s, v1.4s, v5.s[1] + fmul v28.4s, v0.4s, v5.s[2] + fmul v29.4s, v1.4s, v5.s[2] + fmul v30.4s, v0.4s, v5.s[3] + fmul v31.4s, v1.4s, v5.s[3] ld1 {v6.4s}, [pB] add pB, pB, #16 @@ -185,22 +185,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL8x8_M1 - fmla v16.4s, v0.4s, v4.4s[0] - fmla v17.4s, v1.4s, v4.4s[0] - fmla v18.4s, v0.4s, v4.4s[1] - fmla v19.4s, v1.4s, v4.4s[1] - fmla v20.4s, v0.4s, v4.4s[2] - fmla v21.4s, v1.4s, v4.4s[2] - fmla v22.4s, v0.4s, v4.4s[3] - fmla v23.4s, v1.4s, v4.4s[3] - fmla v24.4s, v0.4s, v5.4s[0] - fmla v25.4s, v1.4s, v5.4s[0] - fmla v26.4s, v0.4s, v5.4s[1] - fmla v27.4s, v1.4s, v5.4s[1] - fmla v28.4s, v0.4s, v5.4s[2] - fmla v29.4s, v1.4s, v5.4s[2] - fmla v30.4s, v0.4s, v5.4s[3] - fmla v31.4s, v1.4s, v5.4s[3] + fmla v16.4s, v0.4s, v4.s[0] + fmla v17.4s, v1.4s, v4.s[0] + fmla v18.4s, v0.4s, v4.s[1] + fmla v19.4s, v1.4s, v4.s[1] + fmla v20.4s, v0.4s, v4.s[2] + fmla v21.4s, v1.4s, v4.s[2] + fmla v22.4s, v0.4s, v4.s[3] + fmla v23.4s, v1.4s, v4.s[3] + fmla v24.4s, v0.4s, v5.s[0] + fmla v25.4s, v1.4s, v5.s[0] + fmla v26.4s, v0.4s, v5.s[1] + fmla v27.4s, v1.4s, v5.s[1] + fmla v28.4s, v0.4s, v5.s[2] + fmla v29.4s, v1.4s, v5.s[2] + fmla v30.4s, v0.4s, v5.s[3] + fmla v31.4s, v1.4s, v5.s[3] ld1 {v6.4s}, [pB] add pB, pB, #16 @@ -213,22 +213,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL8x8_M2 - fmla v16.4s, v2.4s, v6.4s[0] - fmla v17.4s, v3.4s, v6.4s[0] - fmla v18.4s, v2.4s, v6.4s[1] - fmla v19.4s, v3.4s, v6.4s[1] - fmla v20.4s, v2.4s, v6.4s[2] - fmla v21.4s, v3.4s, v6.4s[2] - fmla v22.4s, v2.4s, v6.4s[3] - fmla v23.4s, v3.4s, v6.4s[3] - fmla v24.4s, v2.4s, v7.4s[0] - fmla v25.4s, v3.4s, v7.4s[0] - fmla v26.4s, v2.4s, v7.4s[1] - fmla v27.4s, v3.4s, v7.4s[1] - fmla v28.4s, v2.4s, v7.4s[2] - fmla v29.4s, v3.4s, v7.4s[2] - fmla v30.4s, v2.4s, v7.4s[3] - fmla v31.4s, v3.4s, v7.4s[3] + fmla v16.4s, v2.4s, v6.s[0] + fmla v17.4s, v3.4s, v6.s[0] + fmla v18.4s, v2.4s, v6.s[1] + fmla v19.4s, v3.4s, v6.s[1] + fmla v20.4s, v2.4s, v6.s[2] + fmla v21.4s, v3.4s, v6.s[2] + fmla v22.4s, v2.4s, v6.s[3] + fmla v23.4s, v3.4s, v6.s[3] + fmla v24.4s, v2.4s, v7.s[0] + fmla v25.4s, v3.4s, v7.s[0] + fmla v26.4s, v2.4s, v7.s[1] + fmla v27.4s, v3.4s, v7.s[1] + fmla v28.4s, v2.4s, v7.s[2] + fmla v29.4s, v3.4s, v7.s[2] + fmla v30.4s, v2.4s, v7.s[3] + fmla v31.4s, v3.4s, v7.s[3] ld1 {v4.4s}, [pB] add pB, pB, #16 @@ -241,22 +241,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL8x8_E - fmla v16.4s, v2.4s, v6.4s[0] - fmla v17.4s, v3.4s, v6.4s[0] - fmla v18.4s, v2.4s, v6.4s[1] - fmla v19.4s, v3.4s, v6.4s[1] - fmla v20.4s, v2.4s, v6.4s[2] - fmla v21.4s, v3.4s, v6.4s[2] - fmla v22.4s, v2.4s, v6.4s[3] - fmla v23.4s, v3.4s, v6.4s[3] - fmla v24.4s, v2.4s, v7.4s[0] - fmla v25.4s, v3.4s, v7.4s[0] - fmla v26.4s, v2.4s, v7.4s[1] - fmla v27.4s, v3.4s, v7.4s[1] - fmla v28.4s, v2.4s, v7.4s[2] - fmla v29.4s, v3.4s, v7.4s[2] - fmla v30.4s, v2.4s, v7.4s[3] - fmla v31.4s, v3.4s, v7.4s[3] + fmla v16.4s, v2.4s, v6.s[0] + fmla v17.4s, v3.4s, v6.s[0] + fmla v18.4s, v2.4s, v6.s[1] + fmla v19.4s, v3.4s, v6.s[1] + fmla v20.4s, v2.4s, v6.s[2] + fmla v21.4s, v3.4s, v6.s[2] + fmla v22.4s, v2.4s, v6.s[3] + fmla v23.4s, v3.4s, v6.s[3] + fmla v24.4s, v2.4s, v7.s[0] + fmla v25.4s, v3.4s, v7.s[0] + fmla v26.4s, v2.4s, v7.s[1] + fmla v27.4s, v3.4s, v7.s[1] + fmla v28.4s, v2.4s, v7.s[2] + fmla v29.4s, v3.4s, v7.s[2] + fmla v30.4s, v2.4s, v7.s[3] + fmla v31.4s, v3.4s, v7.s[3] .endm .macro KERNEL8x8_SUB @@ -269,22 +269,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld1 {v1.4s}, [pA] add pA, pA, #16 - fmla v16.4s, v0.4s, v4.4s[0] - fmla v17.4s, v1.4s, v4.4s[0] - fmla v18.4s, v0.4s, v4.4s[1] - fmla v19.4s, v1.4s, v4.4s[1] - fmla v20.4s, v0.4s, v4.4s[2] - fmla v21.4s, v1.4s, v4.4s[2] - fmla v22.4s, v0.4s, v4.4s[3] - fmla v23.4s, v1.4s, v4.4s[3] - fmla v24.4s, v0.4s, v5.4s[0] - fmla v25.4s, v1.4s, v5.4s[0] - fmla v26.4s, v0.4s, v5.4s[1] - fmla v27.4s, v1.4s, v5.4s[1] - fmla v28.4s, v0.4s, v5.4s[2] - fmla v29.4s, v1.4s, v5.4s[2] - fmla v30.4s, v0.4s, v5.4s[3] - fmla v31.4s, v1.4s, v5.4s[3] + fmla v16.4s, v0.4s, v4.s[0] + fmla v17.4s, v1.4s, v4.s[0] + fmla v18.4s, v0.4s, v4.s[1] + fmla v19.4s, v1.4s, v4.s[1] + fmla v20.4s, v0.4s, v4.s[2] + fmla v21.4s, v1.4s, v4.s[2] + fmla v22.4s, v0.4s, v4.s[3] + fmla v23.4s, v1.4s, v4.s[3] + fmla v24.4s, v0.4s, v5.s[0] + fmla v25.4s, v1.4s, v5.s[0] + fmla v26.4s, v0.4s, v5.s[1] + fmla v27.4s, v1.4s, v5.s[1] + fmla v28.4s, v0.4s, v5.s[2] + fmla v29.4s, v1.4s, v5.s[2] + fmla v30.4s, v0.4s, v5.s[3] + fmla v31.4s, v1.4s, v5.s[3] .endm .macro SAVE8x8 @@ -367,14 +367,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.4s}, [pA] add pA, pA, #16 - fmul v16.4s, v0.4s, v4.4s[0] - fmul v18.4s, v0.4s, v4.4s[1] - fmul v20.4s, v0.4s, v4.4s[2] - fmul v22.4s, v0.4s, v4.4s[3] - fmul v24.4s, v0.4s, v5.4s[0] - fmul v26.4s, v0.4s, v5.4s[1] - fmul v28.4s, v0.4s, v5.4s[2] - fmul v30.4s, v0.4s, v5.4s[3] + fmul v16.4s, v0.4s, v4.s[0] + fmul v18.4s, v0.4s, v4.s[1] + fmul v20.4s, v0.4s, v4.s[2] + fmul v22.4s, v0.4s, v4.s[3] + fmul v24.4s, v0.4s, v5.s[0] + fmul v26.4s, v0.4s, v5.s[1] + fmul v28.4s, v0.4s, v5.s[2] + fmul v30.4s, v0.4s, v5.s[3] ld1 {v6.4s}, [pB] add pB, pB, #16 @@ -385,14 +385,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL4x8_M1 - fmla v16.4s, v0.4s, v4.4s[0] - fmla v18.4s, v0.4s, v4.4s[1] - fmla v20.4s, v0.4s, v4.4s[2] - fmla v22.4s, v0.4s, v4.4s[3] - fmla v24.4s, v0.4s, v5.4s[0] - fmla v26.4s, v0.4s, v5.4s[1] - fmla v28.4s, v0.4s, v5.4s[2] - fmla v30.4s, v0.4s, v5.4s[3] + fmla v16.4s, v0.4s, v4.s[0] + fmla v18.4s, v0.4s, v4.s[1] + fmla v20.4s, v0.4s, v4.s[2] + fmla v22.4s, v0.4s, v4.s[3] + fmla v24.4s, v0.4s, v5.s[0] + fmla v26.4s, v0.4s, v5.s[1] + fmla v28.4s, v0.4s, v5.s[2] + fmla v30.4s, v0.4s, v5.s[3] ld1 {v6.4s}, [pB] add pB, pB, #16 @@ -403,14 +403,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL4x8_M2 - fmla v16.4s, v2.4s, v6.4s[0] - fmla v18.4s, v2.4s, v6.4s[1] - fmla v20.4s, v2.4s, v6.4s[2] - fmla v22.4s, v2.4s, v6.4s[3] - fmla v24.4s, v2.4s, v7.4s[0] - fmla v26.4s, v2.4s, v7.4s[1] - fmla v28.4s, v2.4s, v7.4s[2] - fmla v30.4s, v2.4s, v7.4s[3] + fmla v16.4s, v2.4s, v6.s[0] + fmla v18.4s, v2.4s, v6.s[1] + fmla v20.4s, v2.4s, v6.s[2] + fmla v22.4s, v2.4s, v6.s[3] + fmla v24.4s, v2.4s, v7.s[0] + fmla v26.4s, v2.4s, v7.s[1] + fmla v28.4s, v2.4s, v7.s[2] + fmla v30.4s, v2.4s, v7.s[3] ld1 {v4.4s}, [pB] add pB, pB, #16 @@ -421,14 +421,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL4x8_E - fmla v16.4s, v2.4s, v6.4s[0] - fmla v18.4s, v2.4s, v6.4s[1] - fmla v20.4s, v2.4s, v6.4s[2] - fmla v22.4s, v2.4s, v6.4s[3] - fmla v24.4s, v2.4s, v7.4s[0] - fmla v26.4s, v2.4s, v7.4s[1] - fmla v28.4s, v2.4s, v7.4s[2] - fmla v30.4s, v2.4s, v7.4s[3] + fmla v16.4s, v2.4s, v6.s[0] + fmla v18.4s, v2.4s, v6.s[1] + fmla v20.4s, v2.4s, v6.s[2] + fmla v22.4s, v2.4s, v6.s[3] + fmla v24.4s, v2.4s, v7.s[0] + fmla v26.4s, v2.4s, v7.s[1] + fmla v28.4s, v2.4s, v7.s[2] + fmla v30.4s, v2.4s, v7.s[3] .endm .macro KERNEL4x8_SUB @@ -439,14 +439,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld1 {v0.4s}, [pA] add pA, pA, #16 - fmla v16.4s, v0.4s, v4.4s[0] - fmla v18.4s, v0.4s, v4.4s[1] - fmla v20.4s, v0.4s, v4.4s[2] - fmla v22.4s, v0.4s, v4.4s[3] - fmla v24.4s, v0.4s, v5.4s[0] - fmla v26.4s, v0.4s, v5.4s[1] - fmla v28.4s, v0.4s, v5.4s[2] - fmla v30.4s, v0.4s, v5.4s[3] + fmla v16.4s, v0.4s, v4.s[0] + fmla v18.4s, v0.4s, v4.s[1] + fmla v20.4s, v0.4s, v4.s[2] + fmla v22.4s, v0.4s, v4.s[3] + fmla v24.4s, v0.4s, v5.s[0] + fmla v26.4s, v0.4s, v5.s[1] + fmla v28.4s, v0.4s, v5.s[2] + fmla v30.4s, v0.4s, v5.s[3] .endm .macro SAVE4x8 @@ -520,14 +520,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s}, [pA] add pA, pA, #8 - fmla v16.2s, v0.2s, v4.4s[0] - fmla v18.2s, v0.2s, v4.4s[1] - fmla v20.2s, v0.2s, v4.4s[2] - fmla v22.2s, v0.2s, v4.4s[3] - fmla v24.2s, v0.2s, v5.4s[0] - fmla v26.2s, v0.2s, v5.4s[1] - fmla v28.2s, v0.2s, v5.4s[2] - fmla v30.2s, v0.2s, v5.4s[3] + fmla v16.2s, v0.2s, v4.s[0] + fmla v18.2s, v0.2s, v4.s[1] + fmla v20.2s, v0.2s, v4.s[2] + fmla v22.2s, v0.2s, v4.s[3] + fmla v24.2s, v0.2s, v5.s[0] + fmla v26.2s, v0.2s, v5.s[1] + fmla v28.2s, v0.2s, v5.s[2] + fmla v30.2s, v0.2s, v5.s[3] .endm .macro SAVE2x8 @@ -601,14 +601,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ldr s0, [pA] add pA, pA, #4 - fmla s16, s0, v4.4s[0] - fmla s18, s0, v4.4s[1] - fmla s20, s0, v4.4s[2] - fmla s22, s0, v4.4s[3] - fmla s24, s0, v5.4s[0] - fmla s26, s0, v5.4s[1] - fmla s28, s0, v5.4s[2] - fmla s30, s0, v5.4s[3] + fmla s16, s0, v4.s[0] + fmla s18, s0, v4.s[1] + fmla s20, s0, v4.s[2] + fmla s22, s0, v4.s[3] + fmla s24, s0, v5.s[0] + fmla s26, s0, v5.s[1] + fmla s28, s0, v5.s[2] + fmla s30, s0, v5.s[3] .endm .macro SAVE1x8 @@ -682,14 +682,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld1 {v1.4s}, [pA] add pA, pA, #16 - fmul v16.4s, v0.4s, v8.2s[0] - fmul v17.4s, v1.4s, v8.2s[0] - fmul v20.4s, v0.4s, v8.2s[1] - fmul v21.4s, v1.4s, v8.2s[1] - fmul v24.4s, v0.4s, v9.2s[0] - fmul v25.4s, v1.4s, v9.2s[0] - fmul v28.4s, v0.4s, v9.2s[1] - fmul v29.4s, v1.4s, v9.2s[1] + fmul v16.4s, v0.4s, v8.s[0] + fmul v17.4s, v1.4s, v8.s[0] + fmul v20.4s, v0.4s, v8.s[1] + fmul v21.4s, v1.4s, v8.s[1] + fmul v24.4s, v0.4s, v9.s[0] + fmul v25.4s, v1.4s, v9.s[0] + fmul v28.4s, v0.4s, v9.s[1] + fmul v29.4s, v1.4s, v9.s[1] ld1 {v12.2s, v13.2s}, [pB] add pB, pB, #16 @@ -700,14 +700,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL8x4_M1 - fmla v16.4s, v0.4s, v8.2s[0] - fmla v17.4s, v1.4s, v8.2s[0] - fmla v20.4s, v0.4s, v8.2s[1] - fmla v21.4s, v1.4s, v8.2s[1] - fmla v24.4s, v0.4s, v9.2s[0] - fmla v25.4s, v1.4s, v9.2s[0] - fmla v28.4s, v0.4s, v9.2s[1] - fmla v29.4s, v1.4s, v9.2s[1] + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] + fmla v20.4s, v0.4s, v8.s[1] + fmla v21.4s, v1.4s, v8.s[1] + fmla v24.4s, v0.4s, v9.s[0] + fmla v25.4s, v1.4s, v9.s[0] + fmla v28.4s, v0.4s, v9.s[1] + fmla v29.4s, v1.4s, v9.s[1] ld1 {v12.2s, v13.2s}, [pB] add pB, pB, #16 @@ -718,14 +718,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL8x4_M2 - fmla v16.4s, v4.4s, v12.2s[0] - fmla v17.4s, v5.4s, v12.2s[0] - fmla v20.4s, v4.4s, v12.2s[1] - fmla v21.4s, v5.4s, v12.2s[1] - fmla v24.4s, v4.4s, v13.2s[0] - fmla v25.4s, v5.4s, v13.2s[0] - fmla v28.4s, v4.4s, v13.2s[1] - fmla v29.4s, v5.4s, v13.2s[1] + fmla v16.4s, v4.4s, v12.s[0] + fmla v17.4s, v5.4s, v12.s[0] + fmla v20.4s, v4.4s, v12.s[1] + fmla v21.4s, v5.4s, v12.s[1] + fmla v24.4s, v4.4s, v13.s[0] + fmla v25.4s, v5.4s, v13.s[0] + fmla v28.4s, v4.4s, v13.s[1] + fmla v29.4s, v5.4s, v13.s[1] ld1 {v8.2s, v9.2s}, [pB] add pB, pB, #16 @@ -736,14 +736,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL8x4_E - fmla v16.4s, v4.4s, v12.2s[0] - fmla v17.4s, v5.4s, v12.2s[0] - fmla v20.4s, v4.4s, v12.2s[1] - fmla v21.4s, v5.4s, v12.2s[1] - fmla v24.4s, v4.4s, v13.2s[0] - fmla v25.4s, v5.4s, v13.2s[0] - fmla v28.4s, v4.4s, v13.2s[1] - fmla v29.4s, v5.4s, v13.2s[1] + fmla v16.4s, v4.4s, v12.s[0] + fmla v17.4s, v5.4s, v12.s[0] + fmla v20.4s, v4.4s, v12.s[1] + fmla v21.4s, v5.4s, v12.s[1] + fmla v24.4s, v4.4s, v13.s[0] + fmla v25.4s, v5.4s, v13.s[0] + fmla v28.4s, v4.4s, v13.s[1] + fmla v29.4s, v5.4s, v13.s[1] .endm .macro KERNEL8x4_SUB @@ -754,14 +754,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v1.4s}, [pA] add pA, pA, #16 - fmla v16.4s, v0.4s, v8.2s[0] - fmla v17.4s, v1.4s, v8.2s[0] - fmla v20.4s, v0.4s, v8.2s[1] - fmla v21.4s, v1.4s, v8.2s[1] - fmla v24.4s, v0.4s, v9.2s[0] - fmla v25.4s, v1.4s, v9.2s[0] - fmla v28.4s, v0.4s, v9.2s[1] - fmla v29.4s, v1.4s, v9.2s[1] + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] + fmla v20.4s, v0.4s, v8.s[1] + fmla v21.4s, v1.4s, v8.s[1] + fmla v24.4s, v0.4s, v9.s[0] + fmla v25.4s, v1.4s, v9.s[0] + fmla v28.4s, v0.4s, v9.s[1] + fmla v29.4s, v1.4s, v9.s[1] .endm .macro SAVE8x4 @@ -814,17 +814,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s, v1.2s}, [pA] add pA, pA, #16 - fmul v16.2s, v0.2s, v8.2s[0] - fmul v29.2s, v1.2s, v9.2s[1] + fmul v16.2s, v0.2s, v8.s[0] + fmul v29.2s, v1.2s, v9.s[1] - fmul v20.2s, v0.2s, v8.2s[1] - fmul v25.2s, v1.2s, v9.2s[0] + fmul v20.2s, v0.2s, v8.s[1] + fmul v25.2s, v1.2s, v9.s[0] - fmul v24.2s, v0.2s, v9.2s[0] - fmul v21.2s, v1.2s, v8.2s[1] + fmul v24.2s, v0.2s, v9.s[0] + fmul v21.2s, v1.2s, v8.s[1] - fmul v28.2s, v0.2s, v9.2s[1] - fmul v17.2s, v1.2s, v8.2s[0] + fmul v28.2s, v0.2s, v9.s[1] + fmul v17.2s, v1.2s, v8.s[0] ld1 {v12.2s, v13.2s}, [pB] add pB, pB, #16 @@ -833,61 +833,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL4x4_M1 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v29.2s, v1.2s, v9.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v29.2s, v1.2s, v9.s[1] ld1 {v12.2s, v13.2s}, [pB] // For next round add pB, pB, #16 - fmla v20.2s, v0.2s, v8.2s[1] - fmla v25.2s, v1.2s, v9.2s[0] + fmla v20.2s, v0.2s, v8.s[1] + fmla v25.2s, v1.2s, v9.s[0] ld1 {v4.2s, v5.2s}, [pA] // For next round add pA, pA, #16 - fmla v24.2s, v0.2s, v9.2s[0] - fmla v21.2s, v1.2s, v8.2s[1] + fmla v24.2s, v0.2s, v9.s[0] + fmla v21.2s, v1.2s, v8.s[1] prfm PLDL1KEEP, [pB, #512] - fmla v28.2s, v0.2s, v9.2s[1] - fmla v17.2s, v1.2s, v8.2s[0] + fmla v28.2s, v0.2s, v9.s[1] + fmla v17.2s, v1.2s, v8.s[0] .endm .macro KERNEL4x4_M2 - fmla v16.2s, v4.2s, v12.2s[0] - fmla v29.2s, v5.2s, v13.2s[1] + fmla v16.2s, v4.2s, v12.s[0] + fmla v29.2s, v5.2s, v13.s[1] ld1 {v8.2s, v9.2s}, [pB] // For next round add pB, pB, #16 - fmla v20.2s, v4.2s, v12.2s[1] - fmla v25.2s, v5.2s, v13.2s[0] + fmla v20.2s, v4.2s, v12.s[1] + fmla v25.2s, v5.2s, v13.s[0] ld1 {v0.2s, v1.2s}, [pA] // For next round add pA, pA, #16 - fmla v24.2s, v4.2s, v13.2s[0] - fmla v21.2s, v5.2s, v12.2s[1] + fmla v24.2s, v4.2s, v13.s[0] + fmla v21.2s, v5.2s, v12.s[1] prfm PLDL1KEEP, [pA, #512] - fmla v28.2s, v4.2s, v13.2s[1] - fmla v17.2s, v5.2s, v12.2s[0] + fmla v28.2s, v4.2s, v13.s[1] + fmla v17.2s, v5.2s, v12.s[0] .endm .macro KERNEL4x4_E - fmla v16.2s, v4.2s, v12.2s[0] - fmla v29.2s, v5.2s, v13.2s[1] + fmla v16.2s, v4.2s, v12.s[0] + fmla v29.2s, v5.2s, v13.s[1] - fmla v20.2s, v4.2s, v12.2s[1] - fmla v25.2s, v5.2s, v13.2s[0] + fmla v20.2s, v4.2s, v12.s[1] + fmla v25.2s, v5.2s, v13.s[0] - fmla v24.2s, v4.2s, v13.2s[0] - fmla v21.2s, v5.2s, v12.2s[1] + fmla v24.2s, v4.2s, v13.s[0] + fmla v21.2s, v5.2s, v12.s[1] - fmla v28.2s, v4.2s, v13.2s[1] - fmla v17.2s, v5.2s, v12.2s[0] + fmla v28.2s, v4.2s, v13.s[1] + fmla v17.2s, v5.2s, v12.s[0] .endm .macro KERNEL4x4_SUB @@ -896,17 +896,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld1 {v0.2s, v1.2s}, [pA] add pA, pA, #16 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v29.2s, v1.2s, v9.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v29.2s, v1.2s, v9.s[1] - fmla v20.2s, v0.2s, v8.2s[1] - fmla v25.2s, v1.2s, v9.2s[0] + fmla v20.2s, v0.2s, v8.s[1] + fmla v25.2s, v1.2s, v9.s[0] - fmla v24.2s, v0.2s, v9.2s[0] - fmla v21.2s, v1.2s, v8.2s[1] + fmla v24.2s, v0.2s, v9.s[0] + fmla v21.2s, v1.2s, v8.s[1] - fmla v28.2s, v0.2s, v9.2s[1] - fmla v17.2s, v1.2s, v8.2s[0] + fmla v28.2s, v0.2s, v9.s[1] + fmla v17.2s, v1.2s, v8.s[0] .endm .macro SAVE4x4 @@ -951,10 +951,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s}, [pA] add pA, pA, #8 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v20.2s, v0.2s, v8.2s[1] - fmla v24.2s, v0.2s, v9.2s[0] - fmla v28.2s, v0.2s, v9.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v20.2s, v0.2s, v8.s[1] + fmla v24.2s, v0.2s, v9.s[0] + fmla v28.2s, v0.2s, v9.s[1] .endm .macro SAVE2x4 @@ -1034,11 +1034,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v1.4s}, [pA] add pA, pA, #16 - fmla v16.4s, v0.4s, v8.2s[0] - fmla v17.4s, v1.4s, v8.2s[0] + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] - fmla v20.4s, v0.4s, v8.2s[1] - fmla v21.4s, v1.4s, v8.2s[1] + fmla v20.4s, v0.4s, v8.s[1] + fmla v21.4s, v1.4s, v8.s[1] .endm .macro SAVE8x2 @@ -1074,10 +1074,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s, v1.2s}, [pA] add pA, pA, #16 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v17.2s, v1.2s, v8.2s[0] - fmla v20.2s, v0.2s, v8.2s[1] - fmla v21.2s, v1.2s, v8.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v17.2s, v1.2s, v8.s[0] + fmla v20.2s, v0.2s, v8.s[1] + fmla v21.2s, v1.2s, v8.s[1] .endm .macro SAVE4x2 @@ -1109,8 +1109,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld1 {v0.2s}, [pA] add pA, pA, #8 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v20.2s, v0.2s, v8.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v20.2s, v0.2s, v8.s[1] .endm .macro SAVE2x2 @@ -1139,7 +1139,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ldr s0 , [pA] add pA, pA, #4 - fmla v16.2s, v8.2s, v0.2s[0] + fmla v16.2s, v8.2s, v0.s[0] .endm .macro SAVE1x2 @@ -1169,8 +1169,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v1.4s}, [pA] add pA, pA, #16 - fmla v16.4s, v0.4s, v8.2s[0] - fmla v17.4s, v1.4s, v8.2s[0] + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] .endm .macro SAVE8x1 @@ -1196,8 +1196,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s, v1.2s}, [pA] add pA , pA, #16 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v17.2s, v1.2s, v8.2s[0] + fmla v16.2s, v0.2s, v8.s[0] + fmla v17.2s, v1.2s, v8.s[0] .endm .macro SAVE4x1 @@ -1222,7 +1222,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s}, [pA] add pA , pA, #8 - fmla v16.2s, v0.2s, v8.2s[0] + fmla v16.2s, v0.2s, v8.s[0] .endm .macro SAVE2x1 diff --git a/kernel/arm64/strmm_kernel_16x4.S b/kernel/arm64/strmm_kernel_16x4.S old mode 100755 new mode 100644 index b99760a03..28b321651 --- a/kernel/arm64/strmm_kernel_16x4.S +++ b/kernel/arm64/strmm_kernel_16x4.S @@ -161,25 +161,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld1 {v3.4s}, [pA] add pA, pA, #16 - fmul v16.4s, v0.4s, v8.2s[0] - fmul v17.4s, v1.4s, v8.2s[0] - fmul v18.4s, v2.4s, v8.2s[0] - fmul v19.4s, v3.4s, v8.2s[0] - - fmul v20.4s, v0.4s, v8.2s[1] - fmul v21.4s, v1.4s, v8.2s[1] - fmul v22.4s, v2.4s, v8.2s[1] - fmul v23.4s, v3.4s, v8.2s[1] - - fmul v24.4s, v0.4s, v9.2s[0] - fmul v25.4s, v1.4s, v9.2s[0] - fmul v26.4s, v2.4s, v9.2s[0] - fmul v27.4s, v3.4s, v9.2s[0] - - fmul v28.4s, v0.4s, v9.2s[1] - fmul v29.4s, v1.4s, v9.2s[1] - fmul v30.4s, v2.4s, v9.2s[1] - fmul v31.4s, v3.4s, v9.2s[1] + fmul v16.4s, v0.4s, v8.s[0] + fmul v17.4s, v1.4s, v8.s[0] + fmul v18.4s, v2.4s, v8.s[0] + fmul v19.4s, v3.4s, v8.s[0] + + fmul v20.4s, v0.4s, v8.s[1] + fmul v21.4s, v1.4s, v8.s[1] + fmul v22.4s, v2.4s, v8.s[1] + fmul v23.4s, v3.4s, v8.s[1] + + fmul v24.4s, v0.4s, v9.s[0] + fmul v25.4s, v1.4s, v9.s[0] + fmul v26.4s, v2.4s, v9.s[0] + fmul v27.4s, v3.4s, v9.s[0] + + fmul v28.4s, v0.4s, v9.s[1] + fmul v29.4s, v1.4s, v9.s[1] + fmul v30.4s, v2.4s, v9.s[1] + fmul v31.4s, v3.4s, v9.s[1] ld1 {v12.2s, v13.2s}, [pB] add pB, pB, #16 @@ -194,25 +194,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL16x4_M1 - fmla v16.4s, v0.4s, v8.2s[0] - fmla v17.4s, v1.4s, v8.2s[0] - fmla v18.4s, v2.4s, v8.2s[0] - fmla v19.4s, v3.4s, v8.2s[0] - - fmla v20.4s, v0.4s, v8.2s[1] - fmla v21.4s, v1.4s, v8.2s[1] - fmla v22.4s, v2.4s, v8.2s[1] - fmla v23.4s, v3.4s, v8.2s[1] - - fmla v24.4s, v0.4s, v9.2s[0] - fmla v25.4s, v1.4s, v9.2s[0] - fmla v26.4s, v2.4s, v9.2s[0] - fmla v27.4s, v3.4s, v9.2s[0] - - fmla v28.4s, v0.4s, v9.2s[1] - fmla v29.4s, v1.4s, v9.2s[1] - fmla v30.4s, v2.4s, v9.2s[1] - fmla v31.4s, v3.4s, v9.2s[1] + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] + fmla v18.4s, v2.4s, v8.s[0] + fmla v19.4s, v3.4s, v8.s[0] + + fmla v20.4s, v0.4s, v8.s[1] + fmla v21.4s, v1.4s, v8.s[1] + fmla v22.4s, v2.4s, v8.s[1] + fmla v23.4s, v3.4s, v8.s[1] + + fmla v24.4s, v0.4s, v9.s[0] + fmla v25.4s, v1.4s, v9.s[0] + fmla v26.4s, v2.4s, v9.s[0] + fmla v27.4s, v3.4s, v9.s[0] + + fmla v28.4s, v0.4s, v9.s[1] + fmla v29.4s, v1.4s, v9.s[1] + fmla v30.4s, v2.4s, v9.s[1] + fmla v31.4s, v3.4s, v9.s[1] ld1 {v12.2s, v13.2s}, [pB] add pB, pB, #16 @@ -227,25 +227,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL16x4_M2 - fmla v16.4s, v4.4s, v12.2s[0] - fmla v17.4s, v5.4s, v12.2s[0] - fmla v18.4s, v6.4s, v12.2s[0] - fmla v19.4s, v7.4s, v12.2s[0] - - fmla v20.4s, v4.4s, v12.2s[1] - fmla v21.4s, v5.4s, v12.2s[1] - fmla v22.4s, v6.4s, v12.2s[1] - fmla v23.4s, v7.4s, v12.2s[1] - - fmla v24.4s, v4.4s, v13.2s[0] - fmla v25.4s, v5.4s, v13.2s[0] - fmla v26.4s, v6.4s, v13.2s[0] - fmla v27.4s, v7.4s, v13.2s[0] - - fmla v28.4s, v4.4s, v13.2s[1] - fmla v29.4s, v5.4s, v13.2s[1] - fmla v30.4s, v6.4s, v13.2s[1] - fmla v31.4s, v7.4s, v13.2s[1] + fmla v16.4s, v4.4s, v12.s[0] + fmla v17.4s, v5.4s, v12.s[0] + fmla v18.4s, v6.4s, v12.s[0] + fmla v19.4s, v7.4s, v12.s[0] + + fmla v20.4s, v4.4s, v12.s[1] + fmla v21.4s, v5.4s, v12.s[1] + fmla v22.4s, v6.4s, v12.s[1] + fmla v23.4s, v7.4s, v12.s[1] + + fmla v24.4s, v4.4s, v13.s[0] + fmla v25.4s, v5.4s, v13.s[0] + fmla v26.4s, v6.4s, v13.s[0] + fmla v27.4s, v7.4s, v13.s[0] + + fmla v28.4s, v4.4s, v13.s[1] + fmla v29.4s, v5.4s, v13.s[1] + fmla v30.4s, v6.4s, v13.s[1] + fmla v31.4s, v7.4s, v13.s[1] ld1 {v8.2s, v9.2s}, [pB] add pB, pB, #16 @@ -260,25 +260,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL16x4_E - fmla v16.4s, v4.4s, v12.2s[0] - fmla v17.4s, v5.4s, v12.2s[0] - fmla v18.4s, v6.4s, v12.2s[0] - fmla v19.4s, v7.4s, v12.2s[0] - - fmla v20.4s, v4.4s, v12.2s[1] - fmla v21.4s, v5.4s, v12.2s[1] - fmla v22.4s, v6.4s, v12.2s[1] - fmla v23.4s, v7.4s, v12.2s[1] - - fmla v24.4s, v4.4s, v13.2s[0] - fmla v25.4s, v5.4s, v13.2s[0] - fmla v26.4s, v6.4s, v13.2s[0] - fmla v27.4s, v7.4s, v13.2s[0] - - fmla v28.4s, v4.4s, v13.2s[1] - fmla v29.4s, v5.4s, v13.2s[1] - fmla v30.4s, v6.4s, v13.2s[1] - fmla v31.4s, v7.4s, v13.2s[1] + fmla v16.4s, v4.4s, v12.s[0] + fmla v17.4s, v5.4s, v12.s[0] + fmla v18.4s, v6.4s, v12.s[0] + fmla v19.4s, v7.4s, v12.s[0] + + fmla v20.4s, v4.4s, v12.s[1] + fmla v21.4s, v5.4s, v12.s[1] + fmla v22.4s, v6.4s, v12.s[1] + fmla v23.4s, v7.4s, v12.s[1] + + fmla v24.4s, v4.4s, v13.s[0] + fmla v25.4s, v5.4s, v13.s[0] + fmla v26.4s, v6.4s, v13.s[0] + fmla v27.4s, v7.4s, v13.s[0] + + fmla v28.4s, v4.4s, v13.s[1] + fmla v29.4s, v5.4s, v13.s[1] + fmla v30.4s, v6.4s, v13.s[1] + fmla v31.4s, v7.4s, v13.s[1] .endm .macro KERNEL16x4_SUB @@ -293,25 +293,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld1 {v3.4s}, [pA] add pA, pA, #16 - fmla v16.4s, v0.4s, v8.2s[0] - fmla v17.4s, v1.4s, v8.2s[0] - fmla v18.4s, v2.4s, v8.2s[0] - fmla v19.4s, v3.4s, v8.2s[0] - - fmla v20.4s, v0.4s, v8.2s[1] - fmla v21.4s, v1.4s, v8.2s[1] - fmla v22.4s, v2.4s, v8.2s[1] - fmla v23.4s, v3.4s, v8.2s[1] - - fmla v24.4s, v0.4s, v9.2s[0] - fmla v25.4s, v1.4s, v9.2s[0] - fmla v26.4s, v2.4s, v9.2s[0] - fmla v27.4s, v3.4s, v9.2s[0] - - fmla v28.4s, v0.4s, v9.2s[1] - fmla v29.4s, v1.4s, v9.2s[1] - fmla v30.4s, v2.4s, v9.2s[1] - fmla v31.4s, v3.4s, v9.2s[1] + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] + fmla v18.4s, v2.4s, v8.s[0] + fmla v19.4s, v3.4s, v8.s[0] + + fmla v20.4s, v0.4s, v8.s[1] + fmla v21.4s, v1.4s, v8.s[1] + fmla v22.4s, v2.4s, v8.s[1] + fmla v23.4s, v3.4s, v8.s[1] + + fmla v24.4s, v0.4s, v9.s[0] + fmla v25.4s, v1.4s, v9.s[0] + fmla v26.4s, v2.4s, v9.s[0] + fmla v27.4s, v3.4s, v9.s[0] + + fmla v28.4s, v0.4s, v9.s[1] + fmla v29.4s, v1.4s, v9.s[1] + fmla v30.4s, v2.4s, v9.s[1] + fmla v31.4s, v3.4s, v9.s[1] .endm .macro SAVE16x4 @@ -369,14 +369,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v1.4s}, [pA] add pA, pA, #16 - fmul v16.4s, v0.4s, v8.2s[0] - fmul v17.4s, v1.4s, v8.2s[0] - fmul v20.4s, v0.4s, v8.2s[1] - fmul v21.4s, v1.4s, v8.2s[1] - fmul v24.4s, v0.4s, v9.2s[0] - fmul v25.4s, v1.4s, v9.2s[0] - fmul v28.4s, v0.4s, v9.2s[1] - fmul v29.4s, v1.4s, v9.2s[1] + fmul v16.4s, v0.4s, v8.s[0] + fmul v17.4s, v1.4s, v8.s[0] + fmul v20.4s, v0.4s, v8.s[1] + fmul v21.4s, v1.4s, v8.s[1] + fmul v24.4s, v0.4s, v9.s[0] + fmul v25.4s, v1.4s, v9.s[0] + fmul v28.4s, v0.4s, v9.s[1] + fmul v29.4s, v1.4s, v9.s[1] ld1 {v12.2s, v13.2s}, [pB] add pB, pB, #16 @@ -387,14 +387,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL8x4_M1 - fmla v16.4s, v0.4s, v8.2s[0] - fmla v17.4s, v1.4s, v8.2s[0] - fmla v20.4s, v0.4s, v8.2s[1] - fmla v21.4s, v1.4s, v8.2s[1] - fmla v24.4s, v0.4s, v9.2s[0] - fmla v25.4s, v1.4s, v9.2s[0] - fmla v28.4s, v0.4s, v9.2s[1] - fmla v29.4s, v1.4s, v9.2s[1] + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] + fmla v20.4s, v0.4s, v8.s[1] + fmla v21.4s, v1.4s, v8.s[1] + fmla v24.4s, v0.4s, v9.s[0] + fmla v25.4s, v1.4s, v9.s[0] + fmla v28.4s, v0.4s, v9.s[1] + fmla v29.4s, v1.4s, v9.s[1] ld1 {v12.2s, v13.2s}, [pB] add pB, pB, #16 @@ -405,14 +405,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL8x4_M2 - fmla v16.4s, v4.4s, v12.2s[0] - fmla v17.4s, v5.4s, v12.2s[0] - fmla v20.4s, v4.4s, v12.2s[1] - fmla v21.4s, v5.4s, v12.2s[1] - fmla v24.4s, v4.4s, v13.2s[0] - fmla v25.4s, v5.4s, v13.2s[0] - fmla v28.4s, v4.4s, v13.2s[1] - fmla v29.4s, v5.4s, v13.2s[1] + fmla v16.4s, v4.4s, v12.s[0] + fmla v17.4s, v5.4s, v12.s[0] + fmla v20.4s, v4.4s, v12.s[1] + fmla v21.4s, v5.4s, v12.s[1] + fmla v24.4s, v4.4s, v13.s[0] + fmla v25.4s, v5.4s, v13.s[0] + fmla v28.4s, v4.4s, v13.s[1] + fmla v29.4s, v5.4s, v13.s[1] ld1 {v8.2s, v9.2s}, [pB] add pB, pB, #16 @@ -423,14 +423,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL8x4_E - fmla v16.4s, v4.4s, v12.2s[0] - fmla v17.4s, v5.4s, v12.2s[0] - fmla v20.4s, v4.4s, v12.2s[1] - fmla v21.4s, v5.4s, v12.2s[1] - fmla v24.4s, v4.4s, v13.2s[0] - fmla v25.4s, v5.4s, v13.2s[0] - fmla v28.4s, v4.4s, v13.2s[1] - fmla v29.4s, v5.4s, v13.2s[1] + fmla v16.4s, v4.4s, v12.s[0] + fmla v17.4s, v5.4s, v12.s[0] + fmla v20.4s, v4.4s, v12.s[1] + fmla v21.4s, v5.4s, v12.s[1] + fmla v24.4s, v4.4s, v13.s[0] + fmla v25.4s, v5.4s, v13.s[0] + fmla v28.4s, v4.4s, v13.s[1] + fmla v29.4s, v5.4s, v13.s[1] .endm .macro KERNEL8x4_SUB @@ -441,14 +441,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld1 {v1.4s}, [pA] add pA, pA, #16 - fmla v16.4s, v0.4s, v8.2s[0] - fmla v17.4s, v1.4s, v8.2s[0] - fmla v20.4s, v0.4s, v8.2s[1] - fmla v21.4s, v1.4s, v8.2s[1] - fmla v24.4s, v0.4s, v9.2s[0] - fmla v25.4s, v1.4s, v9.2s[0] - fmla v28.4s, v0.4s, v9.2s[1] - fmla v29.4s, v1.4s, v9.2s[1] + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] + fmla v20.4s, v0.4s, v8.s[1] + fmla v21.4s, v1.4s, v8.s[1] + fmla v24.4s, v0.4s, v9.s[0] + fmla v25.4s, v1.4s, v9.s[0] + fmla v28.4s, v0.4s, v9.s[1] + fmla v29.4s, v1.4s, v9.s[1] .endm .macro SAVE8x4 @@ -496,17 +496,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s, v1.2s}, [pA] add pA, pA, #16 - fmul v16.2s, v0.2s, v8.2s[0] - fmul v29.2s, v1.2s, v9.2s[1] + fmul v16.2s, v0.2s, v8.s[0] + fmul v29.2s, v1.2s, v9.s[1] - fmul v20.2s, v0.2s, v8.2s[1] - fmul v25.2s, v1.2s, v9.2s[0] + fmul v20.2s, v0.2s, v8.s[1] + fmul v25.2s, v1.2s, v9.s[0] - fmul v24.2s, v0.2s, v9.2s[0] - fmul v21.2s, v1.2s, v8.2s[1] + fmul v24.2s, v0.2s, v9.s[0] + fmul v21.2s, v1.2s, v8.s[1] - fmul v28.2s, v0.2s, v9.2s[1] - fmul v17.2s, v1.2s, v8.2s[0] + fmul v28.2s, v0.2s, v9.s[1] + fmul v17.2s, v1.2s, v8.s[0] ld1 {v12.2s, v13.2s}, [pB] add pB, pB, #16 @@ -515,61 +515,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL4x4_M1 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v29.2s, v1.2s, v9.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v29.2s, v1.2s, v9.s[1] ld1 {v12.2s, v13.2s}, [pB] // For next round add pB, pB, #16 - fmla v20.2s, v0.2s, v8.2s[1] - fmla v25.2s, v1.2s, v9.2s[0] + fmla v20.2s, v0.2s, v8.s[1] + fmla v25.2s, v1.2s, v9.s[0] ld1 {v4.2s, v5.2s}, [pA] // For next round add pA, pA, #16 - fmla v24.2s, v0.2s, v9.2s[0] - fmla v21.2s, v1.2s, v8.2s[1] + fmla v24.2s, v0.2s, v9.s[0] + fmla v21.2s, v1.2s, v8.s[1] prfm PLDL1KEEP, [pB, #512] - fmla v28.2s, v0.2s, v9.2s[1] - fmla v17.2s, v1.2s, v8.2s[0] + fmla v28.2s, v0.2s, v9.s[1] + fmla v17.2s, v1.2s, v8.s[0] .endm .macro KERNEL4x4_M2 - fmla v16.2s, v4.2s, v12.2s[0] - fmla v29.2s, v5.2s, v13.2s[1] + fmla v16.2s, v4.2s, v12.s[0] + fmla v29.2s, v5.2s, v13.s[1] ld1 {v8.2s, v9.2s}, [pB] // For next round add pB, pB, #16 - fmla v20.2s, v4.2s, v12.2s[1] - fmla v25.2s, v5.2s, v13.2s[0] + fmla v20.2s, v4.2s, v12.s[1] + fmla v25.2s, v5.2s, v13.s[0] ld1 {v0.2s, v1.2s}, [pA] // For next round add pA, pA, #16 - fmla v24.2s, v4.2s, v13.2s[0] - fmla v21.2s, v5.2s, v12.2s[1] + fmla v24.2s, v4.2s, v13.s[0] + fmla v21.2s, v5.2s, v12.s[1] prfm PLDL1KEEP, [pA, #512] - fmla v28.2s, v4.2s, v13.2s[1] - fmla v17.2s, v5.2s, v12.2s[0] + fmla v28.2s, v4.2s, v13.s[1] + fmla v17.2s, v5.2s, v12.s[0] .endm .macro KERNEL4x4_E - fmla v16.2s, v4.2s, v12.2s[0] - fmla v29.2s, v5.2s, v13.2s[1] + fmla v16.2s, v4.2s, v12.s[0] + fmla v29.2s, v5.2s, v13.s[1] - fmla v20.2s, v4.2s, v12.2s[1] - fmla v25.2s, v5.2s, v13.2s[0] + fmla v20.2s, v4.2s, v12.s[1] + fmla v25.2s, v5.2s, v13.s[0] - fmla v24.2s, v4.2s, v13.2s[0] - fmla v21.2s, v5.2s, v12.2s[1] + fmla v24.2s, v4.2s, v13.s[0] + fmla v21.2s, v5.2s, v12.s[1] - fmla v28.2s, v4.2s, v13.2s[1] - fmla v17.2s, v5.2s, v12.2s[0] + fmla v28.2s, v4.2s, v13.s[1] + fmla v17.2s, v5.2s, v12.s[0] .endm .macro KERNEL4x4_SUB @@ -578,17 +578,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld1 {v0.2s, v1.2s}, [pA] add pA, pA, #16 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v29.2s, v1.2s, v9.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v29.2s, v1.2s, v9.s[1] - fmla v20.2s, v0.2s, v8.2s[1] - fmla v25.2s, v1.2s, v9.2s[0] + fmla v20.2s, v0.2s, v8.s[1] + fmla v25.2s, v1.2s, v9.s[0] - fmla v24.2s, v0.2s, v9.2s[0] - fmla v21.2s, v1.2s, v8.2s[1] + fmla v24.2s, v0.2s, v9.s[0] + fmla v21.2s, v1.2s, v8.s[1] - fmla v28.2s, v0.2s, v9.2s[1] - fmla v17.2s, v1.2s, v8.2s[0] + fmla v28.2s, v0.2s, v9.s[1] + fmla v17.2s, v1.2s, v8.s[0] .endm .macro SAVE4x4 @@ -633,10 +633,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s}, [pA] add pA, pA, #8 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v20.2s, v0.2s, v8.2s[1] - fmla v24.2s, v0.2s, v9.2s[0] - fmla v28.2s, v0.2s, v9.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v20.2s, v0.2s, v8.s[1] + fmla v24.2s, v0.2s, v9.s[0] + fmla v28.2s, v0.2s, v9.s[1] .endm .macro SAVE2x4 @@ -718,15 +718,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v3.4s}, [pA] add pA, pA, #16 - fmla v16.4s, v0.4s, v8.2s[0] - fmla v17.4s, v1.4s, v8.2s[0] - fmla v18.4s, v2.4s, v8.2s[0] - fmla v19.4s, v3.4s, v8.2s[0] + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] + fmla v18.4s, v2.4s, v8.s[0] + fmla v19.4s, v3.4s, v8.s[0] - fmla v20.4s, v0.4s, v8.2s[1] - fmla v21.4s, v1.4s, v8.2s[1] - fmla v22.4s, v2.4s, v8.2s[1] - fmla v23.4s, v3.4s, v8.2s[1] + fmla v20.4s, v0.4s, v8.s[1] + fmla v21.4s, v1.4s, v8.s[1] + fmla v22.4s, v2.4s, v8.s[1] + fmla v23.4s, v3.4s, v8.s[1] .endm .macro SAVE16x2 @@ -764,11 +764,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld1 {v1.4s}, [pA] add pA, pA, #16 - fmla v16.4s, v0.4s, v8.2s[0] - fmla v17.4s, v1.4s, v8.2s[0] + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] - fmla v20.4s, v0.4s, v8.2s[1] - fmla v21.4s, v1.4s, v8.2s[1] + fmla v20.4s, v0.4s, v8.s[1] + fmla v21.4s, v1.4s, v8.s[1] .endm .macro SAVE8x2 @@ -802,10 +802,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s, v1.2s}, [pA] add pA, pA, #16 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v17.2s, v1.2s, v8.2s[0] - fmla v20.2s, v0.2s, v8.2s[1] - fmla v21.2s, v1.2s, v8.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v17.2s, v1.2s, v8.s[0] + fmla v20.2s, v0.2s, v8.s[1] + fmla v21.2s, v1.2s, v8.s[1] .endm .macro SAVE4x2 @@ -837,8 +837,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s}, [pA] add pA, pA, #8 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v20.2s, v0.2s, v8.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v20.2s, v0.2s, v8.s[1] .endm .macro SAVE2x2 @@ -866,7 +866,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ldr s0 , [pA] add pA, pA, #4 - fmla v16.2s, v8.2s, v0.2s[0] + fmla v16.2s, v8.2s, v0.s[0] .endm .macro SAVE1x2 @@ -901,10 +901,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v3.4s}, [pA] add pA, pA, #16 - fmla v16.4s, v0.4s, v8.2s[0] - fmla v17.4s, v1.4s, v8.2s[0] - fmla v18.4s, v2.4s, v8.2s[0] - fmla v19.4s, v3.4s, v8.2s[0] + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] + fmla v18.4s, v2.4s, v8.s[0] + fmla v19.4s, v3.4s, v8.s[0] .endm .macro SAVE16x1 @@ -934,8 +934,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v1.4s}, [pA] add pA, pA, #16 - fmla v16.4s, v0.4s, v8.2s[0] - fmla v17.4s, v1.4s, v8.2s[0] + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] .endm .macro SAVE8x1 @@ -961,8 +961,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld1 {v0.2s, v1.2s}, [pA] add pA , pA, #16 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v17.2s, v1.2s, v8.2s[0] + fmla v16.2s, v0.2s, v8.s[0] + fmla v17.2s, v1.2s, v8.s[0] .endm .macro SAVE4x1 @@ -987,7 +987,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s}, [pA] add pA , pA, #8 - fmla v16.2s, v0.2s, v8.2s[0] + fmla v16.2s, v0.2s, v8.s[0] .endm .macro SAVE2x1 diff --git a/kernel/arm64/strmm_kernel_4x4.S b/kernel/arm64/strmm_kernel_4x4.S index 674e200d8..eeb3e6e72 100644 --- a/kernel/arm64/strmm_kernel_4x4.S +++ b/kernel/arm64/strmm_kernel_4x4.S @@ -147,17 +147,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s, v1.2s}, [pA] add pA, pA, #16 - fmul v16.2s, v0.2s, v8.2s[0] - fmul v29.2s, v1.2s, v9.2s[1] + fmul v16.2s, v0.2s, v8.s[0] + fmul v29.2s, v1.2s, v9.s[1] - fmul v20.2s, v0.2s, v8.2s[1] - fmul v25.2s, v1.2s, v9.2s[0] + fmul v20.2s, v0.2s, v8.s[1] + fmul v25.2s, v1.2s, v9.s[0] - fmul v24.2s, v0.2s, v9.2s[0] - fmul v21.2s, v1.2s, v8.2s[1] + fmul v24.2s, v0.2s, v9.s[0] + fmul v21.2s, v1.2s, v8.s[1] - fmul v28.2s, v0.2s, v9.2s[1] - fmul v17.2s, v1.2s, v8.2s[0] + fmul v28.2s, v0.2s, v9.s[1] + fmul v17.2s, v1.2s, v8.s[0] ld1 {v12.2s, v13.2s}, [pB] add pB, pB, #16 @@ -166,61 +166,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL4x4_M1 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v29.2s, v1.2s, v9.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v29.2s, v1.2s, v9.s[1] ld1 {v12.2s, v13.2s}, [pB] // For next round add pB, pB, #16 - fmla v20.2s, v0.2s, v8.2s[1] - fmla v25.2s, v1.2s, v9.2s[0] + fmla v20.2s, v0.2s, v8.s[1] + fmla v25.2s, v1.2s, v9.s[0] ld1 {v4.2s, v5.2s}, [pA] // For next round add pA, pA, #16 - fmla v24.2s, v0.2s, v9.2s[0] - fmla v21.2s, v1.2s, v8.2s[1] + fmla v24.2s, v0.2s, v9.s[0] + fmla v21.2s, v1.2s, v8.s[1] prfm PLDL1KEEP, [pB, #512] - fmla v28.2s, v0.2s, v9.2s[1] - fmla v17.2s, v1.2s, v8.2s[0] + fmla v28.2s, v0.2s, v9.s[1] + fmla v17.2s, v1.2s, v8.s[0] .endm .macro KERNEL4x4_M2 - fmla v16.2s, v4.2s, v12.2s[0] - fmla v29.2s, v5.2s, v13.2s[1] + fmla v16.2s, v4.2s, v12.s[0] + fmla v29.2s, v5.2s, v13.s[1] ld1 {v8.2s, v9.2s}, [pB] // For next round add pB, pB, #16 - fmla v20.2s, v4.2s, v12.2s[1] - fmla v25.2s, v5.2s, v13.2s[0] + fmla v20.2s, v4.2s, v12.s[1] + fmla v25.2s, v5.2s, v13.s[0] ld1 {v0.2s, v1.2s}, [pA] // For next round add pA, pA, #16 - fmla v24.2s, v4.2s, v13.2s[0] - fmla v21.2s, v5.2s, v12.2s[1] + fmla v24.2s, v4.2s, v13.s[0] + fmla v21.2s, v5.2s, v12.s[1] prfm PLDL1KEEP, [pA, #512] - fmla v28.2s, v4.2s, v13.2s[1] - fmla v17.2s, v5.2s, v12.2s[0] + fmla v28.2s, v4.2s, v13.s[1] + fmla v17.2s, v5.2s, v12.s[0] .endm .macro KERNEL4x4_E - fmla v16.2s, v4.2s, v12.2s[0] - fmla v29.2s, v5.2s, v13.2s[1] + fmla v16.2s, v4.2s, v12.s[0] + fmla v29.2s, v5.2s, v13.s[1] - fmla v20.2s, v4.2s, v12.2s[1] - fmla v25.2s, v5.2s, v13.2s[0] + fmla v20.2s, v4.2s, v12.s[1] + fmla v25.2s, v5.2s, v13.s[0] - fmla v24.2s, v4.2s, v13.2s[0] - fmla v21.2s, v5.2s, v12.2s[1] + fmla v24.2s, v4.2s, v13.s[0] + fmla v21.2s, v5.2s, v12.s[1] - fmla v28.2s, v4.2s, v13.2s[1] - fmla v17.2s, v5.2s, v12.2s[0] + fmla v28.2s, v4.2s, v13.s[1] + fmla v17.2s, v5.2s, v12.s[0] .endm .macro KERNEL4x4_SUB @@ -229,17 +229,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld1 {v0.2s, v1.2s}, [pA] add pA, pA, #16 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v29.2s, v1.2s, v9.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v29.2s, v1.2s, v9.s[1] - fmla v20.2s, v0.2s, v8.2s[1] - fmla v25.2s, v1.2s, v9.2s[0] + fmla v20.2s, v0.2s, v8.s[1] + fmla v25.2s, v1.2s, v9.s[0] - fmla v24.2s, v0.2s, v9.2s[0] - fmla v21.2s, v1.2s, v8.2s[1] + fmla v24.2s, v0.2s, v9.s[0] + fmla v21.2s, v1.2s, v8.s[1] - fmla v28.2s, v0.2s, v9.2s[1] - fmla v17.2s, v1.2s, v8.2s[0] + fmla v28.2s, v0.2s, v9.s[1] + fmla v17.2s, v1.2s, v8.s[0] .endm .macro SAVE4x4 @@ -280,10 +280,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s}, [pA] add pA, pA, #8 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v20.2s, v0.2s, v8.2s[1] - fmla v24.2s, v0.2s, v9.2s[0] - fmla v28.2s, v0.2s, v9.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v20.2s, v0.2s, v8.s[1] + fmla v24.2s, v0.2s, v9.s[0] + fmla v28.2s, v0.2s, v9.s[1] .endm .macro SAVE2x4 @@ -353,10 +353,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s, v1.2s}, [pA] add pA, pA, #16 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v17.2s, v1.2s, v8.2s[0] - fmla v20.2s, v0.2s, v8.2s[1] - fmla v21.2s, v1.2s, v8.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v17.2s, v1.2s, v8.s[0] + fmla v20.2s, v0.2s, v8.s[1] + fmla v21.2s, v1.2s, v8.s[1] .endm .macro SAVE4x2 @@ -386,8 +386,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s}, [pA] add pA, pA, #8 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v20.2s, v0.2s, v8.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v20.2s, v0.2s, v8.s[1] .endm .macro SAVE2x2 @@ -414,7 +414,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ldr s0 , [pA] add pA, pA, #4 - fmla v16.2s, v8.2s, v0.2s[0] + fmla v16.2s, v8.2s, v0.s[0] .endm .macro SAVE1x2 @@ -440,8 +440,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld1 {v0.2s, v1.2s}, [pA] add pA , pA, #16 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v17.2s, v1.2s, v8.2s[0] + fmla v16.2s, v0.2s, v8.s[0] + fmla v17.2s, v1.2s, v8.s[0] .endm .macro SAVE4x1 @@ -468,7 +468,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s}, [pA] add pA , pA, #8 - fmla v16.2s, v0.2s, v8.2s[0] + fmla v16.2s, v0.2s, v8.s[0] .endm .macro SAVE2x1 diff --git a/kernel/arm64/strmm_kernel_8x8.S b/kernel/arm64/strmm_kernel_8x8.S old mode 100755 new mode 100644 index 98b912934..843f0c890 --- a/kernel/arm64/strmm_kernel_8x8.S +++ b/kernel/arm64/strmm_kernel_8x8.S @@ -159,22 +159,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v1.4s}, [pA] add pA, pA, #16 - fmul v16.4s, v0.4s, v4.4s[0] - fmul v17.4s, v1.4s, v4.4s[0] - fmul v18.4s, v0.4s, v4.4s[1] - fmul v19.4s, v1.4s, v4.4s[1] - fmul v20.4s, v0.4s, v4.4s[2] - fmul v21.4s, v1.4s, v4.4s[2] - fmul v22.4s, v0.4s, v4.4s[3] - fmul v23.4s, v1.4s, v4.4s[3] - fmul v24.4s, v0.4s, v5.4s[0] - fmul v25.4s, v1.4s, v5.4s[0] - fmul v26.4s, v0.4s, v5.4s[1] - fmul v27.4s, v1.4s, v5.4s[1] - fmul v28.4s, v0.4s, v5.4s[2] - fmul v29.4s, v1.4s, v5.4s[2] - fmul v30.4s, v0.4s, v5.4s[3] - fmul v31.4s, v1.4s, v5.4s[3] + fmul v16.4s, v0.4s, v4.s[0] + fmul v17.4s, v1.4s, v4.s[0] + fmul v18.4s, v0.4s, v4.s[1] + fmul v19.4s, v1.4s, v4.s[1] + fmul v20.4s, v0.4s, v4.s[2] + fmul v21.4s, v1.4s, v4.s[2] + fmul v22.4s, v0.4s, v4.s[3] + fmul v23.4s, v1.4s, v4.s[3] + fmul v24.4s, v0.4s, v5.s[0] + fmul v25.4s, v1.4s, v5.s[0] + fmul v26.4s, v0.4s, v5.s[1] + fmul v27.4s, v1.4s, v5.s[1] + fmul v28.4s, v0.4s, v5.s[2] + fmul v29.4s, v1.4s, v5.s[2] + fmul v30.4s, v0.4s, v5.s[3] + fmul v31.4s, v1.4s, v5.s[3] ld1 {v6.4s}, [pB] add pB, pB, #16 @@ -187,22 +187,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL8x8_M1 - fmla v16.4s, v0.4s, v4.4s[0] - fmla v17.4s, v1.4s, v4.4s[0] - fmla v18.4s, v0.4s, v4.4s[1] - fmla v19.4s, v1.4s, v4.4s[1] - fmla v20.4s, v0.4s, v4.4s[2] - fmla v21.4s, v1.4s, v4.4s[2] - fmla v22.4s, v0.4s, v4.4s[3] - fmla v23.4s, v1.4s, v4.4s[3] - fmla v24.4s, v0.4s, v5.4s[0] - fmla v25.4s, v1.4s, v5.4s[0] - fmla v26.4s, v0.4s, v5.4s[1] - fmla v27.4s, v1.4s, v5.4s[1] - fmla v28.4s, v0.4s, v5.4s[2] - fmla v29.4s, v1.4s, v5.4s[2] - fmla v30.4s, v0.4s, v5.4s[3] - fmla v31.4s, v1.4s, v5.4s[3] + fmla v16.4s, v0.4s, v4.s[0] + fmla v17.4s, v1.4s, v4.s[0] + fmla v18.4s, v0.4s, v4.s[1] + fmla v19.4s, v1.4s, v4.s[1] + fmla v20.4s, v0.4s, v4.s[2] + fmla v21.4s, v1.4s, v4.s[2] + fmla v22.4s, v0.4s, v4.s[3] + fmla v23.4s, v1.4s, v4.s[3] + fmla v24.4s, v0.4s, v5.s[0] + fmla v25.4s, v1.4s, v5.s[0] + fmla v26.4s, v0.4s, v5.s[1] + fmla v27.4s, v1.4s, v5.s[1] + fmla v28.4s, v0.4s, v5.s[2] + fmla v29.4s, v1.4s, v5.s[2] + fmla v30.4s, v0.4s, v5.s[3] + fmla v31.4s, v1.4s, v5.s[3] ld1 {v6.4s}, [pB] add pB, pB, #16 @@ -215,22 +215,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL8x8_M2 - fmla v16.4s, v2.4s, v6.4s[0] - fmla v17.4s, v3.4s, v6.4s[0] - fmla v18.4s, v2.4s, v6.4s[1] - fmla v19.4s, v3.4s, v6.4s[1] - fmla v20.4s, v2.4s, v6.4s[2] - fmla v21.4s, v3.4s, v6.4s[2] - fmla v22.4s, v2.4s, v6.4s[3] - fmla v23.4s, v3.4s, v6.4s[3] - fmla v24.4s, v2.4s, v7.4s[0] - fmla v25.4s, v3.4s, v7.4s[0] - fmla v26.4s, v2.4s, v7.4s[1] - fmla v27.4s, v3.4s, v7.4s[1] - fmla v28.4s, v2.4s, v7.4s[2] - fmla v29.4s, v3.4s, v7.4s[2] - fmla v30.4s, v2.4s, v7.4s[3] - fmla v31.4s, v3.4s, v7.4s[3] + fmla v16.4s, v2.4s, v6.s[0] + fmla v17.4s, v3.4s, v6.s[0] + fmla v18.4s, v2.4s, v6.s[1] + fmla v19.4s, v3.4s, v6.s[1] + fmla v20.4s, v2.4s, v6.s[2] + fmla v21.4s, v3.4s, v6.s[2] + fmla v22.4s, v2.4s, v6.s[3] + fmla v23.4s, v3.4s, v6.s[3] + fmla v24.4s, v2.4s, v7.s[0] + fmla v25.4s, v3.4s, v7.s[0] + fmla v26.4s, v2.4s, v7.s[1] + fmla v27.4s, v3.4s, v7.s[1] + fmla v28.4s, v2.4s, v7.s[2] + fmla v29.4s, v3.4s, v7.s[2] + fmla v30.4s, v2.4s, v7.s[3] + fmla v31.4s, v3.4s, v7.s[3] ld1 {v4.4s}, [pB] add pB, pB, #16 @@ -243,22 +243,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL8x8_E - fmla v16.4s, v2.4s, v6.4s[0] - fmla v17.4s, v3.4s, v6.4s[0] - fmla v18.4s, v2.4s, v6.4s[1] - fmla v19.4s, v3.4s, v6.4s[1] - fmla v20.4s, v2.4s, v6.4s[2] - fmla v21.4s, v3.4s, v6.4s[2] - fmla v22.4s, v2.4s, v6.4s[3] - fmla v23.4s, v3.4s, v6.4s[3] - fmla v24.4s, v2.4s, v7.4s[0] - fmla v25.4s, v3.4s, v7.4s[0] - fmla v26.4s, v2.4s, v7.4s[1] - fmla v27.4s, v3.4s, v7.4s[1] - fmla v28.4s, v2.4s, v7.4s[2] - fmla v29.4s, v3.4s, v7.4s[2] - fmla v30.4s, v2.4s, v7.4s[3] - fmla v31.4s, v3.4s, v7.4s[3] + fmla v16.4s, v2.4s, v6.s[0] + fmla v17.4s, v3.4s, v6.s[0] + fmla v18.4s, v2.4s, v6.s[1] + fmla v19.4s, v3.4s, v6.s[1] + fmla v20.4s, v2.4s, v6.s[2] + fmla v21.4s, v3.4s, v6.s[2] + fmla v22.4s, v2.4s, v6.s[3] + fmla v23.4s, v3.4s, v6.s[3] + fmla v24.4s, v2.4s, v7.s[0] + fmla v25.4s, v3.4s, v7.s[0] + fmla v26.4s, v2.4s, v7.s[1] + fmla v27.4s, v3.4s, v7.s[1] + fmla v28.4s, v2.4s, v7.s[2] + fmla v29.4s, v3.4s, v7.s[2] + fmla v30.4s, v2.4s, v7.s[3] + fmla v31.4s, v3.4s, v7.s[3] .endm .macro KERNEL8x8_SUB @@ -271,22 +271,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld1 {v1.4s}, [pA] add pA, pA, #16 - fmla v16.4s, v0.4s, v4.4s[0] - fmla v17.4s, v1.4s, v4.4s[0] - fmla v18.4s, v0.4s, v4.4s[1] - fmla v19.4s, v1.4s, v4.4s[1] - fmla v20.4s, v0.4s, v4.4s[2] - fmla v21.4s, v1.4s, v4.4s[2] - fmla v22.4s, v0.4s, v4.4s[3] - fmla v23.4s, v1.4s, v4.4s[3] - fmla v24.4s, v0.4s, v5.4s[0] - fmla v25.4s, v1.4s, v5.4s[0] - fmla v26.4s, v0.4s, v5.4s[1] - fmla v27.4s, v1.4s, v5.4s[1] - fmla v28.4s, v0.4s, v5.4s[2] - fmla v29.4s, v1.4s, v5.4s[2] - fmla v30.4s, v0.4s, v5.4s[3] - fmla v31.4s, v1.4s, v5.4s[3] + fmla v16.4s, v0.4s, v4.s[0] + fmla v17.4s, v1.4s, v4.s[0] + fmla v18.4s, v0.4s, v4.s[1] + fmla v19.4s, v1.4s, v4.s[1] + fmla v20.4s, v0.4s, v4.s[2] + fmla v21.4s, v1.4s, v4.s[2] + fmla v22.4s, v0.4s, v4.s[3] + fmla v23.4s, v1.4s, v4.s[3] + fmla v24.4s, v0.4s, v5.s[0] + fmla v25.4s, v1.4s, v5.s[0] + fmla v26.4s, v0.4s, v5.s[1] + fmla v27.4s, v1.4s, v5.s[1] + fmla v28.4s, v0.4s, v5.s[2] + fmla v29.4s, v1.4s, v5.s[2] + fmla v30.4s, v0.4s, v5.s[3] + fmla v31.4s, v1.4s, v5.s[3] .endm .macro SAVE8x8 @@ -361,14 +361,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.4s}, [pA] add pA, pA, #16 - fmul v16.4s, v0.4s, v4.4s[0] - fmul v18.4s, v0.4s, v4.4s[1] - fmul v20.4s, v0.4s, v4.4s[2] - fmul v22.4s, v0.4s, v4.4s[3] - fmul v24.4s, v0.4s, v5.4s[0] - fmul v26.4s, v0.4s, v5.4s[1] - fmul v28.4s, v0.4s, v5.4s[2] - fmul v30.4s, v0.4s, v5.4s[3] + fmul v16.4s, v0.4s, v4.s[0] + fmul v18.4s, v0.4s, v4.s[1] + fmul v20.4s, v0.4s, v4.s[2] + fmul v22.4s, v0.4s, v4.s[3] + fmul v24.4s, v0.4s, v5.s[0] + fmul v26.4s, v0.4s, v5.s[1] + fmul v28.4s, v0.4s, v5.s[2] + fmul v30.4s, v0.4s, v5.s[3] ld1 {v6.4s}, [pB] add pB, pB, #16 @@ -379,14 +379,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL4x8_M1 - fmla v16.4s, v0.4s, v4.4s[0] - fmla v18.4s, v0.4s, v4.4s[1] - fmla v20.4s, v0.4s, v4.4s[2] - fmla v22.4s, v0.4s, v4.4s[3] - fmla v24.4s, v0.4s, v5.4s[0] - fmla v26.4s, v0.4s, v5.4s[1] - fmla v28.4s, v0.4s, v5.4s[2] - fmla v30.4s, v0.4s, v5.4s[3] + fmla v16.4s, v0.4s, v4.s[0] + fmla v18.4s, v0.4s, v4.s[1] + fmla v20.4s, v0.4s, v4.s[2] + fmla v22.4s, v0.4s, v4.s[3] + fmla v24.4s, v0.4s, v5.s[0] + fmla v26.4s, v0.4s, v5.s[1] + fmla v28.4s, v0.4s, v5.s[2] + fmla v30.4s, v0.4s, v5.s[3] ld1 {v6.4s}, [pB] add pB, pB, #16 @@ -397,14 +397,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL4x8_M2 - fmla v16.4s, v2.4s, v6.4s[0] - fmla v18.4s, v2.4s, v6.4s[1] - fmla v20.4s, v2.4s, v6.4s[2] - fmla v22.4s, v2.4s, v6.4s[3] - fmla v24.4s, v2.4s, v7.4s[0] - fmla v26.4s, v2.4s, v7.4s[1] - fmla v28.4s, v2.4s, v7.4s[2] - fmla v30.4s, v2.4s, v7.4s[3] + fmla v16.4s, v2.4s, v6.s[0] + fmla v18.4s, v2.4s, v6.s[1] + fmla v20.4s, v2.4s, v6.s[2] + fmla v22.4s, v2.4s, v6.s[3] + fmla v24.4s, v2.4s, v7.s[0] + fmla v26.4s, v2.4s, v7.s[1] + fmla v28.4s, v2.4s, v7.s[2] + fmla v30.4s, v2.4s, v7.s[3] ld1 {v4.4s}, [pB] add pB, pB, #16 @@ -415,14 +415,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL4x8_E - fmla v16.4s, v2.4s, v6.4s[0] - fmla v18.4s, v2.4s, v6.4s[1] - fmla v20.4s, v2.4s, v6.4s[2] - fmla v22.4s, v2.4s, v6.4s[3] - fmla v24.4s, v2.4s, v7.4s[0] - fmla v26.4s, v2.4s, v7.4s[1] - fmla v28.4s, v2.4s, v7.4s[2] - fmla v30.4s, v2.4s, v7.4s[3] + fmla v16.4s, v2.4s, v6.s[0] + fmla v18.4s, v2.4s, v6.s[1] + fmla v20.4s, v2.4s, v6.s[2] + fmla v22.4s, v2.4s, v6.s[3] + fmla v24.4s, v2.4s, v7.s[0] + fmla v26.4s, v2.4s, v7.s[1] + fmla v28.4s, v2.4s, v7.s[2] + fmla v30.4s, v2.4s, v7.s[3] .endm .macro KERNEL4x8_SUB @@ -433,14 +433,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld1 {v0.4s}, [pA] add pA, pA, #16 - fmla v16.4s, v0.4s, v4.4s[0] - fmla v18.4s, v0.4s, v4.4s[1] - fmla v20.4s, v0.4s, v4.4s[2] - fmla v22.4s, v0.4s, v4.4s[3] - fmla v24.4s, v0.4s, v5.4s[0] - fmla v26.4s, v0.4s, v5.4s[1] - fmla v28.4s, v0.4s, v5.4s[2] - fmla v30.4s, v0.4s, v5.4s[3] + fmla v16.4s, v0.4s, v4.s[0] + fmla v18.4s, v0.4s, v4.s[1] + fmla v20.4s, v0.4s, v4.s[2] + fmla v22.4s, v0.4s, v4.s[3] + fmla v24.4s, v0.4s, v5.s[0] + fmla v26.4s, v0.4s, v5.s[1] + fmla v28.4s, v0.4s, v5.s[2] + fmla v30.4s, v0.4s, v5.s[3] .endm .macro SAVE4x8 @@ -514,14 +514,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s}, [pA] add pA, pA, #8 - fmla v16.2s, v0.2s, v4.4s[0] - fmla v18.2s, v0.2s, v4.4s[1] - fmla v20.2s, v0.2s, v4.4s[2] - fmla v22.2s, v0.2s, v4.4s[3] - fmla v24.2s, v0.2s, v5.4s[0] - fmla v26.2s, v0.2s, v5.4s[1] - fmla v28.2s, v0.2s, v5.4s[2] - fmla v30.2s, v0.2s, v5.4s[3] + fmla v16.2s, v0.2s, v4.s[0] + fmla v18.2s, v0.2s, v4.s[1] + fmla v20.2s, v0.2s, v4.s[2] + fmla v22.2s, v0.2s, v4.s[3] + fmla v24.2s, v0.2s, v5.s[0] + fmla v26.2s, v0.2s, v5.s[1] + fmla v28.2s, v0.2s, v5.s[2] + fmla v30.2s, v0.2s, v5.s[3] .endm .macro SAVE2x8 @@ -595,14 +595,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ldr s0, [pA] add pA, pA, #4 - fmla s16, s0, v4.4s[0] - fmla s18, s0, v4.4s[1] - fmla s20, s0, v4.4s[2] - fmla s22, s0, v4.4s[3] - fmla s24, s0, v5.4s[0] - fmla s26, s0, v5.4s[1] - fmla s28, s0, v5.4s[2] - fmla s30, s0, v5.4s[3] + fmla s16, s0, v4.s[0] + fmla s18, s0, v4.s[1] + fmla s20, s0, v4.s[2] + fmla s22, s0, v4.s[3] + fmla s24, s0, v5.s[0] + fmla s26, s0, v5.s[1] + fmla s28, s0, v5.s[2] + fmla s30, s0, v5.s[3] .endm .macro SAVE1x8 @@ -676,14 +676,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld1 {v1.4s}, [pA] add pA, pA, #16 - fmul v16.4s, v0.4s, v8.2s[0] - fmul v17.4s, v1.4s, v8.2s[0] - fmul v20.4s, v0.4s, v8.2s[1] - fmul v21.4s, v1.4s, v8.2s[1] - fmul v24.4s, v0.4s, v9.2s[0] - fmul v25.4s, v1.4s, v9.2s[0] - fmul v28.4s, v0.4s, v9.2s[1] - fmul v29.4s, v1.4s, v9.2s[1] + fmul v16.4s, v0.4s, v8.s[0] + fmul v17.4s, v1.4s, v8.s[0] + fmul v20.4s, v0.4s, v8.s[1] + fmul v21.4s, v1.4s, v8.s[1] + fmul v24.4s, v0.4s, v9.s[0] + fmul v25.4s, v1.4s, v9.s[0] + fmul v28.4s, v0.4s, v9.s[1] + fmul v29.4s, v1.4s, v9.s[1] ld1 {v12.2s, v13.2s}, [pB] add pB, pB, #16 @@ -694,14 +694,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL8x4_M1 - fmla v16.4s, v0.4s, v8.2s[0] - fmla v17.4s, v1.4s, v8.2s[0] - fmla v20.4s, v0.4s, v8.2s[1] - fmla v21.4s, v1.4s, v8.2s[1] - fmla v24.4s, v0.4s, v9.2s[0] - fmla v25.4s, v1.4s, v9.2s[0] - fmla v28.4s, v0.4s, v9.2s[1] - fmla v29.4s, v1.4s, v9.2s[1] + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] + fmla v20.4s, v0.4s, v8.s[1] + fmla v21.4s, v1.4s, v8.s[1] + fmla v24.4s, v0.4s, v9.s[0] + fmla v25.4s, v1.4s, v9.s[0] + fmla v28.4s, v0.4s, v9.s[1] + fmla v29.4s, v1.4s, v9.s[1] ld1 {v12.2s, v13.2s}, [pB] add pB, pB, #16 @@ -712,14 +712,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL8x4_M2 - fmla v16.4s, v4.4s, v12.2s[0] - fmla v17.4s, v5.4s, v12.2s[0] - fmla v20.4s, v4.4s, v12.2s[1] - fmla v21.4s, v5.4s, v12.2s[1] - fmla v24.4s, v4.4s, v13.2s[0] - fmla v25.4s, v5.4s, v13.2s[0] - fmla v28.4s, v4.4s, v13.2s[1] - fmla v29.4s, v5.4s, v13.2s[1] + fmla v16.4s, v4.4s, v12.s[0] + fmla v17.4s, v5.4s, v12.s[0] + fmla v20.4s, v4.4s, v12.s[1] + fmla v21.4s, v5.4s, v12.s[1] + fmla v24.4s, v4.4s, v13.s[0] + fmla v25.4s, v5.4s, v13.s[0] + fmla v28.4s, v4.4s, v13.s[1] + fmla v29.4s, v5.4s, v13.s[1] ld1 {v8.2s, v9.2s}, [pB] add pB, pB, #16 @@ -730,14 +730,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL8x4_E - fmla v16.4s, v4.4s, v12.2s[0] - fmla v17.4s, v5.4s, v12.2s[0] - fmla v20.4s, v4.4s, v12.2s[1] - fmla v21.4s, v5.4s, v12.2s[1] - fmla v24.4s, v4.4s, v13.2s[0] - fmla v25.4s, v5.4s, v13.2s[0] - fmla v28.4s, v4.4s, v13.2s[1] - fmla v29.4s, v5.4s, v13.2s[1] + fmla v16.4s, v4.4s, v12.s[0] + fmla v17.4s, v5.4s, v12.s[0] + fmla v20.4s, v4.4s, v12.s[1] + fmla v21.4s, v5.4s, v12.s[1] + fmla v24.4s, v4.4s, v13.s[0] + fmla v25.4s, v5.4s, v13.s[0] + fmla v28.4s, v4.4s, v13.s[1] + fmla v29.4s, v5.4s, v13.s[1] .endm .macro KERNEL8x4_SUB @@ -748,14 +748,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v1.4s}, [pA] add pA, pA, #16 - fmla v16.4s, v0.4s, v8.2s[0] - fmla v17.4s, v1.4s, v8.2s[0] - fmla v20.4s, v0.4s, v8.2s[1] - fmla v21.4s, v1.4s, v8.2s[1] - fmla v24.4s, v0.4s, v9.2s[0] - fmla v25.4s, v1.4s, v9.2s[0] - fmla v28.4s, v0.4s, v9.2s[1] - fmla v29.4s, v1.4s, v9.2s[1] + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] + fmla v20.4s, v0.4s, v8.s[1] + fmla v21.4s, v1.4s, v8.s[1] + fmla v24.4s, v0.4s, v9.s[0] + fmla v25.4s, v1.4s, v9.s[0] + fmla v28.4s, v0.4s, v9.s[1] + fmla v29.4s, v1.4s, v9.s[1] .endm .macro SAVE8x4 @@ -808,17 +808,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s, v1.2s}, [pA] add pA, pA, #16 - fmul v16.2s, v0.2s, v8.2s[0] - fmul v29.2s, v1.2s, v9.2s[1] + fmul v16.2s, v0.2s, v8.s[0] + fmul v29.2s, v1.2s, v9.s[1] - fmul v20.2s, v0.2s, v8.2s[1] - fmul v25.2s, v1.2s, v9.2s[0] + fmul v20.2s, v0.2s, v8.s[1] + fmul v25.2s, v1.2s, v9.s[0] - fmul v24.2s, v0.2s, v9.2s[0] - fmul v21.2s, v1.2s, v8.2s[1] + fmul v24.2s, v0.2s, v9.s[0] + fmul v21.2s, v1.2s, v8.s[1] - fmul v28.2s, v0.2s, v9.2s[1] - fmul v17.2s, v1.2s, v8.2s[0] + fmul v28.2s, v0.2s, v9.s[1] + fmul v17.2s, v1.2s, v8.s[0] ld1 {v12.2s, v13.2s}, [pB] add pB, pB, #16 @@ -827,61 +827,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL4x4_M1 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v29.2s, v1.2s, v9.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v29.2s, v1.2s, v9.s[1] ld1 {v12.2s, v13.2s}, [pB] // For next round add pB, pB, #16 - fmla v20.2s, v0.2s, v8.2s[1] - fmla v25.2s, v1.2s, v9.2s[0] + fmla v20.2s, v0.2s, v8.s[1] + fmla v25.2s, v1.2s, v9.s[0] ld1 {v4.2s, v5.2s}, [pA] // For next round add pA, pA, #16 - fmla v24.2s, v0.2s, v9.2s[0] - fmla v21.2s, v1.2s, v8.2s[1] + fmla v24.2s, v0.2s, v9.s[0] + fmla v21.2s, v1.2s, v8.s[1] prfm PLDL1KEEP, [pB, #512] - fmla v28.2s, v0.2s, v9.2s[1] - fmla v17.2s, v1.2s, v8.2s[0] + fmla v28.2s, v0.2s, v9.s[1] + fmla v17.2s, v1.2s, v8.s[0] .endm .macro KERNEL4x4_M2 - fmla v16.2s, v4.2s, v12.2s[0] - fmla v29.2s, v5.2s, v13.2s[1] + fmla v16.2s, v4.2s, v12.s[0] + fmla v29.2s, v5.2s, v13.s[1] ld1 {v8.2s, v9.2s}, [pB] // For next round add pB, pB, #16 - fmla v20.2s, v4.2s, v12.2s[1] - fmla v25.2s, v5.2s, v13.2s[0] + fmla v20.2s, v4.2s, v12.s[1] + fmla v25.2s, v5.2s, v13.s[0] ld1 {v0.2s, v1.2s}, [pA] // For next round add pA, pA, #16 - fmla v24.2s, v4.2s, v13.2s[0] - fmla v21.2s, v5.2s, v12.2s[1] + fmla v24.2s, v4.2s, v13.s[0] + fmla v21.2s, v5.2s, v12.s[1] prfm PLDL1KEEP, [pA, #512] - fmla v28.2s, v4.2s, v13.2s[1] - fmla v17.2s, v5.2s, v12.2s[0] + fmla v28.2s, v4.2s, v13.s[1] + fmla v17.2s, v5.2s, v12.s[0] .endm .macro KERNEL4x4_E - fmla v16.2s, v4.2s, v12.2s[0] - fmla v29.2s, v5.2s, v13.2s[1] + fmla v16.2s, v4.2s, v12.s[0] + fmla v29.2s, v5.2s, v13.s[1] - fmla v20.2s, v4.2s, v12.2s[1] - fmla v25.2s, v5.2s, v13.2s[0] + fmla v20.2s, v4.2s, v12.s[1] + fmla v25.2s, v5.2s, v13.s[0] - fmla v24.2s, v4.2s, v13.2s[0] - fmla v21.2s, v5.2s, v12.2s[1] + fmla v24.2s, v4.2s, v13.s[0] + fmla v21.2s, v5.2s, v12.s[1] - fmla v28.2s, v4.2s, v13.2s[1] - fmla v17.2s, v5.2s, v12.2s[0] + fmla v28.2s, v4.2s, v13.s[1] + fmla v17.2s, v5.2s, v12.s[0] .endm .macro KERNEL4x4_SUB @@ -890,17 +890,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld1 {v0.2s, v1.2s}, [pA] add pA, pA, #16 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v29.2s, v1.2s, v9.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v29.2s, v1.2s, v9.s[1] - fmla v20.2s, v0.2s, v8.2s[1] - fmla v25.2s, v1.2s, v9.2s[0] + fmla v20.2s, v0.2s, v8.s[1] + fmla v25.2s, v1.2s, v9.s[0] - fmla v24.2s, v0.2s, v9.2s[0] - fmla v21.2s, v1.2s, v8.2s[1] + fmla v24.2s, v0.2s, v9.s[0] + fmla v21.2s, v1.2s, v8.s[1] - fmla v28.2s, v0.2s, v9.2s[1] - fmla v17.2s, v1.2s, v8.2s[0] + fmla v28.2s, v0.2s, v9.s[1] + fmla v17.2s, v1.2s, v8.s[0] .endm .macro SAVE4x4 @@ -945,10 +945,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s}, [pA] add pA, pA, #8 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v20.2s, v0.2s, v8.2s[1] - fmla v24.2s, v0.2s, v9.2s[0] - fmla v28.2s, v0.2s, v9.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v20.2s, v0.2s, v8.s[1] + fmla v24.2s, v0.2s, v9.s[0] + fmla v28.2s, v0.2s, v9.s[1] .endm .macro SAVE2x4 @@ -1028,11 +1028,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v1.4s}, [pA] add pA, pA, #16 - fmla v16.4s, v0.4s, v8.2s[0] - fmla v17.4s, v1.4s, v8.2s[0] + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] - fmla v20.4s, v0.4s, v8.2s[1] - fmla v21.4s, v1.4s, v8.2s[1] + fmla v20.4s, v0.4s, v8.s[1] + fmla v21.4s, v1.4s, v8.s[1] .endm .macro SAVE8x2 @@ -1068,10 +1068,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s, v1.2s}, [pA] add pA, pA, #16 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v17.2s, v1.2s, v8.2s[0] - fmla v20.2s, v0.2s, v8.2s[1] - fmla v21.2s, v1.2s, v8.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v17.2s, v1.2s, v8.s[0] + fmla v20.2s, v0.2s, v8.s[1] + fmla v21.2s, v1.2s, v8.s[1] .endm .macro SAVE4x2 @@ -1103,8 +1103,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld1 {v0.2s}, [pA] add pA, pA, #8 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v20.2s, v0.2s, v8.2s[1] + fmla v16.2s, v0.2s, v8.s[0] + fmla v20.2s, v0.2s, v8.s[1] .endm .macro SAVE2x2 @@ -1133,7 +1133,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ldr s0 , [pA] add pA, pA, #4 - fmla v16.2s, v8.2s, v0.2s[0] + fmla v16.2s, v8.2s, v0.s[0] .endm .macro SAVE1x2 @@ -1163,8 +1163,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v1.4s}, [pA] add pA, pA, #16 - fmla v16.4s, v0.4s, v8.2s[0] - fmla v17.4s, v1.4s, v8.2s[0] + fmla v16.4s, v0.4s, v8.s[0] + fmla v17.4s, v1.4s, v8.s[0] .endm .macro SAVE8x1 @@ -1190,8 +1190,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s, v1.2s}, [pA] add pA , pA, #16 - fmla v16.2s, v0.2s, v8.2s[0] - fmla v17.2s, v1.2s, v8.2s[0] + fmla v16.2s, v0.2s, v8.s[0] + fmla v17.2s, v1.2s, v8.s[0] .endm .macro SAVE4x1 @@ -1216,7 +1216,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v0.2s}, [pA] add pA , pA, #8 - fmla v16.2s, v0.2s, v8.2s[0] + fmla v16.2s, v0.2s, v8.s[0] .endm .macro SAVE2x1 diff --git a/kernel/arm64/zgemm_kernel_4x4.S b/kernel/arm64/zgemm_kernel_4x4.S index 28ce3de40..1cb695e56 100644 --- a/kernel/arm64/zgemm_kernel_4x4.S +++ b/kernel/arm64/zgemm_kernel_4x4.S @@ -182,93 +182,93 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld2 {v2.2d, v3.2d}, [pA] add pA, pA, #32 - fmul v16.2d, v0.2d, v8.2d[0] - OP_ii v16.2d, v1.2d, v9.2d[0] + fmul v16.2d, v0.2d, v8.d[0] + OP_ii v16.2d, v1.2d, v9.d[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v17.16b, v17.16b, v17.16b - fmls v17.2d, v0.2d, v9.2d[0] + fmls v17.2d, v0.2d, v9.d[0] #else - fmul v17.2d, v0.2d, v9.2d[0] + fmul v17.2d, v0.2d, v9.d[0] #endif - OP_ir v17.2d, v1.2d, v8.2d[0] + OP_ir v17.2d, v1.2d, v8.d[0] - fmul v18.2d, v2.2d, v8.2d[0] - OP_ii v18.2d, v3.2d, v9.2d[0] + fmul v18.2d, v2.2d, v8.d[0] + OP_ii v18.2d, v3.2d, v9.d[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v19.16b, v19.16b, v19.16b - fmls v19.2d, v2.2d, v9.2d[0] + fmls v19.2d, v2.2d, v9.d[0] #else - fmul v19.2d, v2.2d, v9.2d[0] + fmul v19.2d, v2.2d, v9.d[0] #endif - OP_ir v19.2d, v3.2d, v8.2d[0] + OP_ir v19.2d, v3.2d, v8.d[0] - fmul v20.2d, v0.2d, v8.2d[1] - OP_ii v20.2d, v1.2d, v9.2d[1] + fmul v20.2d, v0.2d, v8.d[1] + OP_ii v20.2d, v1.2d, v9.d[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v21.16b, v21.16b, v21.16b - fmls v21.2d, v0.2d, v9.2d[1] + fmls v21.2d, v0.2d, v9.d[1] #else - fmul v21.2d, v0.2d, v9.2d[1] + fmul v21.2d, v0.2d, v9.d[1] #endif - OP_ir v21.2d, v1.2d, v8.2d[1] + OP_ir v21.2d, v1.2d, v8.d[1] - fmul v22.2d, v2.2d, v8.2d[1] - OP_ii v22.2d, v3.2d, v9.2d[1] + fmul v22.2d, v2.2d, v8.d[1] + OP_ii v22.2d, v3.2d, v9.d[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v23.16b, v23.16b, v23.16b - fmls v23.2d, v2.2d, v9.2d[1] + fmls v23.2d, v2.2d, v9.d[1] #else - fmul v23.2d, v2.2d, v9.2d[1] + fmul v23.2d, v2.2d, v9.d[1] #endif - OP_ir v23.2d, v3.2d, v8.2d[1] + OP_ir v23.2d, v3.2d, v8.d[1] - fmul v24.2d, v0.2d, v10.2d[0] - OP_ii v24.2d, v1.2d, 
v11.2d[0] + fmul v24.2d, v0.2d, v10.d[0] + OP_ii v24.2d, v1.2d, v11.d[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v25.16b, v25.16b, v25.16b - fmls v25.2d, v0.2d, v11.2d[0] + fmls v25.2d, v0.2d, v11.d[0] #else - fmul v25.2d, v0.2d, v11.2d[0] + fmul v25.2d, v0.2d, v11.d[0] #endif - OP_ir v25.2d, v1.2d, v10.2d[0] + OP_ir v25.2d, v1.2d, v10.d[0] - fmul v26.2d, v2.2d, v10.2d[0] - OP_ii v26.2d, v3.2d, v11.2d[0] + fmul v26.2d, v2.2d, v10.d[0] + OP_ii v26.2d, v3.2d, v11.d[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v27.16b, v27.16b, v27.16b - fmls v27.2d, v2.2d, v11.2d[0] + fmls v27.2d, v2.2d, v11.d[0] #else - fmul v27.2d, v2.2d, v11.2d[0] + fmul v27.2d, v2.2d, v11.d[0] #endif - OP_ir v27.2d, v3.2d, v10.2d[0] + OP_ir v27.2d, v3.2d, v10.d[0] - fmul v28.2d, v0.2d, v10.2d[1] - OP_ii v28.2d, v1.2d, v11.2d[1] + fmul v28.2d, v0.2d, v10.d[1] + OP_ii v28.2d, v1.2d, v11.d[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v29.16b, v29.16b, v29.16b - fmls v29.2d, v0.2d, v11.2d[1] + fmls v29.2d, v0.2d, v11.d[1] #else - fmul v29.2d, v0.2d, v11.2d[1] + fmul v29.2d, v0.2d, v11.d[1] #endif - OP_ir v29.2d, v1.2d, v10.2d[1] + OP_ir v29.2d, v1.2d, v10.d[1] - fmul v30.2d, v2.2d, v10.2d[1] - OP_ii v30.2d, v3.2d, v11.2d[1] + fmul v30.2d, v2.2d, v10.d[1] + OP_ii v30.2d, v3.2d, v11.d[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v31.16b, v31.16b, v31.16b - fmls v31.2d, v2.2d, v11.2d[1] + fmls v31.2d, v2.2d, v11.d[1] #else - fmul v31.2d, v2.2d, v11.2d[1] + fmul v31.2d, v2.2d, v11.d[1] #endif - OP_ir v31.2d, v3.2d, v10.2d[1] + OP_ir v31.2d, v3.2d, v10.d[1] ld2 {v12.2d, v13.2d}, [pB] add pB, pB, #32 @@ -281,161 +281,161 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL4x4_M1 - OP_rr v16.2d, v0.2d, v8.2d[0] - OP_ii v16.2d, v1.2d, v9.2d[0] - OP_ri v17.2d, v0.2d, v9.2d[0] - OP_ir v17.2d, v1.2d, v8.2d[0] + OP_rr v16.2d, v0.2d, v8.d[0] + OP_ii v16.2d, v1.2d, v9.d[0] + OP_ri v17.2d, v0.2d, v9.d[0] + OP_ir v17.2d, v1.2d, v8.d[0] ld2 {v12.2d, v13.2d}, [pB] // For next round add pB, pB, #32 - OP_rr v18.2d, v2.2d, v8.2d[0] - OP_ii v18.2d, v3.2d, v9.2d[0] - OP_ri v19.2d, v2.2d, v9.2d[0] - OP_ir v19.2d, v3.2d, v8.2d[0] + OP_rr v18.2d, v2.2d, v8.d[0] + OP_ii v18.2d, v3.2d, v9.d[0] + OP_ri v19.2d, v2.2d, v9.d[0] + OP_ir v19.2d, v3.2d, v8.d[0] ld2 {v14.2d, v15.2d}, [pB] // For next round add pB, pB, #32 - OP_rr v20.2d, v0.2d, v8.2d[1] - OP_ii v20.2d, v1.2d, v9.2d[1] - OP_ri v21.2d, v0.2d, v9.2d[1] - OP_ir v21.2d, v1.2d, v8.2d[1] + OP_rr v20.2d, v0.2d, v8.d[1] + OP_ii v20.2d, v1.2d, v9.d[1] + OP_ri v21.2d, v0.2d, v9.d[1] + OP_ir v21.2d, v1.2d, v8.d[1] ld2 {v4.2d, v5.2d} , [pA] // For next round add pA, pA, #32 - OP_rr v22.2d, v2.2d, v8.2d[1] - OP_ii v22.2d, v3.2d, v9.2d[1] - OP_ri v23.2d, v2.2d, v9.2d[1] - OP_ir v23.2d, v3.2d, v8.2d[1] + OP_rr v22.2d, v2.2d, v8.d[1] + OP_ii v22.2d, v3.2d, v9.d[1] + OP_ri v23.2d, v2.2d, v9.d[1] + OP_ir v23.2d, v3.2d, v8.d[1] ld2 {v6.2d, v7.2d} , [pA] // For next round add pA, pA, #32 - OP_rr v24.2d, v0.2d, v10.2d[0] - OP_ii v24.2d, v1.2d, v11.2d[0] - OP_ri v25.2d, v0.2d, v11.2d[0] - OP_ir v25.2d, v1.2d, v10.2d[0] + OP_rr v24.2d, v0.2d, v10.d[0] + OP_ii v24.2d, v1.2d, v11.d[0] + OP_ri v25.2d, v0.2d, v11.d[0] + OP_ir v25.2d, v1.2d, v10.d[0] prfm PLDL1KEEP, [pA, #512] - OP_rr v26.2d, v2.2d, v10.2d[0] - OP_ii v26.2d, v3.2d, v11.2d[0] - OP_ri v27.2d, v2.2d, v11.2d[0] - OP_ir v27.2d, v3.2d, v10.2d[0] + OP_rr v26.2d, v2.2d, v10.d[0] + OP_ii v26.2d, v3.2d, v11.d[0] + OP_ri v27.2d, v2.2d, v11.d[0] + OP_ir v27.2d, v3.2d, v10.d[0] prfm PLDL1KEEP, [pB, #512] - OP_rr v28.2d, v0.2d, v10.2d[1] - OP_ii v28.2d, v1.2d, v11.2d[1] - OP_ri v29.2d, v0.2d, v11.2d[1] - OP_ir v29.2d, v1.2d, 
v10.2d[1] + OP_rr v28.2d, v0.2d, v10.d[1] + OP_ii v28.2d, v1.2d, v11.d[1] + OP_ri v29.2d, v0.2d, v11.d[1] + OP_ir v29.2d, v1.2d, v10.d[1] - OP_rr v30.2d, v2.2d, v10.2d[1] - OP_ii v30.2d, v3.2d, v11.2d[1] - OP_ri v31.2d, v2.2d, v11.2d[1] - OP_ir v31.2d, v3.2d, v10.2d[1] + OP_rr v30.2d, v2.2d, v10.d[1] + OP_ii v30.2d, v3.2d, v11.d[1] + OP_ri v31.2d, v2.2d, v11.d[1] + OP_ir v31.2d, v3.2d, v10.d[1] .endm .macro KERNEL4x4_M2 - OP_rr v16.2d, v4.2d, v12.2d[0] - OP_ii v16.2d, v5.2d, v13.2d[0] - OP_ri v17.2d, v4.2d, v13.2d[0] - OP_ir v17.2d, v5.2d, v12.2d[0] + OP_rr v16.2d, v4.2d, v12.d[0] + OP_ii v16.2d, v5.2d, v13.d[0] + OP_ri v17.2d, v4.2d, v13.d[0] + OP_ir v17.2d, v5.2d, v12.d[0] ld2 {v8.2d, v9.2d}, [pB] // For next round add pB, pB, #32 - OP_rr v18.2d, v6.2d, v12.2d[0] - OP_ii v18.2d, v7.2d, v13.2d[0] - OP_ri v19.2d, v6.2d, v13.2d[0] - OP_ir v19.2d, v7.2d, v12.2d[0] + OP_rr v18.2d, v6.2d, v12.d[0] + OP_ii v18.2d, v7.2d, v13.d[0] + OP_ri v19.2d, v6.2d, v13.d[0] + OP_ir v19.2d, v7.2d, v12.d[0] ld2 {v10.2d, v11.2d}, [pB] // For next round add pB, pB, #32 - OP_rr v20.2d, v4.2d, v12.2d[1] - OP_ii v20.2d, v5.2d, v13.2d[1] - OP_ri v21.2d, v4.2d, v13.2d[1] - OP_ir v21.2d, v5.2d, v12.2d[1] + OP_rr v20.2d, v4.2d, v12.d[1] + OP_ii v20.2d, v5.2d, v13.d[1] + OP_ri v21.2d, v4.2d, v13.d[1] + OP_ir v21.2d, v5.2d, v12.d[1] ld2 {v0.2d, v1.2d}, [pA] // For next round add pA, pA, #32 - OP_rr v22.2d, v6.2d, v12.2d[1] - OP_ii v22.2d, v7.2d, v13.2d[1] - OP_ri v23.2d, v6.2d, v13.2d[1] - OP_ir v23.2d, v7.2d, v12.2d[1] + OP_rr v22.2d, v6.2d, v12.d[1] + OP_ii v22.2d, v7.2d, v13.d[1] + OP_ri v23.2d, v6.2d, v13.d[1] + OP_ir v23.2d, v7.2d, v12.d[1] ld2 {v2.2d, v3.2d}, [pA] // For next round add pA, pA, #32 - OP_rr v24.2d, v4.2d, v14.2d[0] - OP_ii v24.2d, v5.2d, v15.2d[0] - OP_ri v25.2d, v4.2d, v15.2d[0] - OP_ir v25.2d, v5.2d, v14.2d[0] + OP_rr v24.2d, v4.2d, v14.d[0] + OP_ii v24.2d, v5.2d, v15.d[0] + OP_ri v25.2d, v4.2d, v15.d[0] + OP_ir v25.2d, v5.2d, v14.d[0] prfm PLDL1KEEP, [pA, #512] - OP_rr 
v26.2d, v6.2d, v14.2d[0] - OP_ii v26.2d, v7.2d, v15.2d[0] - OP_ri v27.2d, v6.2d, v15.2d[0] - OP_ir v27.2d, v7.2d, v14.2d[0] + OP_rr v26.2d, v6.2d, v14.d[0] + OP_ii v26.2d, v7.2d, v15.d[0] + OP_ri v27.2d, v6.2d, v15.d[0] + OP_ir v27.2d, v7.2d, v14.d[0] prfm PLDL1KEEP, [pB, #512] - OP_rr v28.2d, v4.2d, v14.2d[1] - OP_ii v28.2d, v5.2d, v15.2d[1] - OP_ri v29.2d, v4.2d, v15.2d[1] - OP_ir v29.2d, v5.2d, v14.2d[1] + OP_rr v28.2d, v4.2d, v14.d[1] + OP_ii v28.2d, v5.2d, v15.d[1] + OP_ri v29.2d, v4.2d, v15.d[1] + OP_ir v29.2d, v5.2d, v14.d[1] - OP_rr v30.2d, v6.2d, v14.2d[1] - OP_ii v30.2d, v7.2d, v15.2d[1] - OP_ri v31.2d, v6.2d, v15.2d[1] - OP_ir v31.2d, v7.2d, v14.2d[1] + OP_rr v30.2d, v6.2d, v14.d[1] + OP_ii v30.2d, v7.2d, v15.d[1] + OP_ri v31.2d, v6.2d, v15.d[1] + OP_ir v31.2d, v7.2d, v14.d[1] .endm .macro KERNEL4x4_E - OP_rr v16.2d, v4.2d, v12.2d[0] - OP_ii v16.2d, v5.2d, v13.2d[0] - OP_ri v17.2d, v4.2d, v13.2d[0] - OP_ir v17.2d, v5.2d, v12.2d[0] - - OP_rr v18.2d, v6.2d, v12.2d[0] - OP_ii v18.2d, v7.2d, v13.2d[0] - OP_ri v19.2d, v6.2d, v13.2d[0] - OP_ir v19.2d, v7.2d, v12.2d[0] - - OP_rr v20.2d, v4.2d, v12.2d[1] - OP_ii v20.2d, v5.2d, v13.2d[1] - OP_ri v21.2d, v4.2d, v13.2d[1] - OP_ir v21.2d, v5.2d, v12.2d[1] - - OP_rr v22.2d, v6.2d, v12.2d[1] - OP_ii v22.2d, v7.2d, v13.2d[1] - OP_ri v23.2d, v6.2d, v13.2d[1] - OP_ir v23.2d, v7.2d, v12.2d[1] - - OP_rr v24.2d, v4.2d, v14.2d[0] - OP_ii v24.2d, v5.2d, v15.2d[0] - OP_ri v25.2d, v4.2d, v15.2d[0] - OP_ir v25.2d, v5.2d, v14.2d[0] - - OP_rr v26.2d, v6.2d, v14.2d[0] - OP_ii v26.2d, v7.2d, v15.2d[0] - OP_ri v27.2d, v6.2d, v15.2d[0] - OP_ir v27.2d, v7.2d, v14.2d[0] - - OP_rr v28.2d, v4.2d, v14.2d[1] - OP_ii v28.2d, v5.2d, v15.2d[1] - OP_ri v29.2d, v4.2d, v15.2d[1] - OP_ir v29.2d, v5.2d, v14.2d[1] - - OP_rr v30.2d, v6.2d, v14.2d[1] - OP_ii v30.2d, v7.2d, v15.2d[1] - OP_ri v31.2d, v6.2d, v15.2d[1] - OP_ir v31.2d, v7.2d, v14.2d[1] + OP_rr v16.2d, v4.2d, v12.d[0] + OP_ii v16.2d, v5.2d, v13.d[0] + OP_ri v17.2d, v4.2d, v13.d[0] + OP_ir 
v17.2d, v5.2d, v12.d[0] + + OP_rr v18.2d, v6.2d, v12.d[0] + OP_ii v18.2d, v7.2d, v13.d[0] + OP_ri v19.2d, v6.2d, v13.d[0] + OP_ir v19.2d, v7.2d, v12.d[0] + + OP_rr v20.2d, v4.2d, v12.d[1] + OP_ii v20.2d, v5.2d, v13.d[1] + OP_ri v21.2d, v4.2d, v13.d[1] + OP_ir v21.2d, v5.2d, v12.d[1] + + OP_rr v22.2d, v6.2d, v12.d[1] + OP_ii v22.2d, v7.2d, v13.d[1] + OP_ri v23.2d, v6.2d, v13.d[1] + OP_ir v23.2d, v7.2d, v12.d[1] + + OP_rr v24.2d, v4.2d, v14.d[0] + OP_ii v24.2d, v5.2d, v15.d[0] + OP_ri v25.2d, v4.2d, v15.d[0] + OP_ir v25.2d, v5.2d, v14.d[0] + + OP_rr v26.2d, v6.2d, v14.d[0] + OP_ii v26.2d, v7.2d, v15.d[0] + OP_ri v27.2d, v6.2d, v15.d[0] + OP_ir v27.2d, v7.2d, v14.d[0] + + OP_rr v28.2d, v4.2d, v14.d[1] + OP_ii v28.2d, v5.2d, v15.d[1] + OP_ri v29.2d, v4.2d, v15.d[1] + OP_ir v29.2d, v5.2d, v14.d[1] + + OP_rr v30.2d, v6.2d, v14.d[1] + OP_ii v30.2d, v7.2d, v15.d[1] + OP_ri v31.2d, v6.2d, v15.d[1] + OP_ir v31.2d, v7.2d, v14.d[1] .endm .macro KERNEL4x4_SUB @@ -448,45 +448,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld2 {v2.2d, v3.2d}, [pA] add pA, pA, #32 - OP_rr v16.2d, v0.2d, v8.2d[0] - OP_ii v16.2d, v1.2d, v9.2d[0] - OP_ri v17.2d, v0.2d, v9.2d[0] - OP_ir v17.2d, v1.2d, v8.2d[0] - - OP_rr v18.2d, v2.2d, v8.2d[0] - OP_ii v18.2d, v3.2d, v9.2d[0] - OP_ri v19.2d, v2.2d, v9.2d[0] - OP_ir v19.2d, v3.2d, v8.2d[0] - - OP_rr v20.2d, v0.2d, v8.2d[1] - OP_ii v20.2d, v1.2d, v9.2d[1] - OP_ri v21.2d, v0.2d, v9.2d[1] - OP_ir v21.2d, v1.2d, v8.2d[1] - - OP_rr v22.2d, v2.2d, v8.2d[1] - OP_ii v22.2d, v3.2d, v9.2d[1] - OP_ri v23.2d, v2.2d, v9.2d[1] - OP_ir v23.2d, v3.2d, v8.2d[1] - - OP_rr v24.2d, v0.2d, v10.2d[0] - OP_ii v24.2d, v1.2d, v11.2d[0] - OP_ri v25.2d, v0.2d, v11.2d[0] - OP_ir v25.2d, v1.2d, v10.2d[0] - - OP_rr v26.2d, v2.2d, v10.2d[0] - OP_ii v26.2d, v3.2d, v11.2d[0] - OP_ri v27.2d, v2.2d, v11.2d[0] - OP_ir v27.2d, v3.2d, v10.2d[0] - - OP_rr v28.2d, v0.2d, v10.2d[1] - OP_ii v28.2d, v1.2d, v11.2d[1] - OP_ri v29.2d, v0.2d, v11.2d[1] - OP_ir v29.2d, v1.2d, v10.2d[1] - - OP_rr v30.2d, v2.2d, v10.2d[1] - OP_ii v30.2d, v3.2d, v11.2d[1] - OP_ri v31.2d, v2.2d, v11.2d[1] - OP_ir v31.2d, v3.2d, v10.2d[1] + OP_rr v16.2d, v0.2d, v8.d[0] + OP_ii v16.2d, v1.2d, v9.d[0] + OP_ri v17.2d, v0.2d, v9.d[0] + OP_ir v17.2d, v1.2d, v8.d[0] + + OP_rr v18.2d, v2.2d, v8.d[0] + OP_ii v18.2d, v3.2d, v9.d[0] + OP_ri v19.2d, v2.2d, v9.d[0] + OP_ir v19.2d, v3.2d, v8.d[0] + + OP_rr v20.2d, v0.2d, v8.d[1] + OP_ii v20.2d, v1.2d, v9.d[1] + OP_ri v21.2d, v0.2d, v9.d[1] + OP_ir v21.2d, v1.2d, v8.d[1] + + OP_rr v22.2d, v2.2d, v8.d[1] + OP_ii v22.2d, v3.2d, v9.d[1] + OP_ri v23.2d, v2.2d, v9.d[1] + OP_ir v23.2d, v3.2d, v8.d[1] + + OP_rr v24.2d, v0.2d, v10.d[0] + OP_ii v24.2d, v1.2d, v11.d[0] + OP_ri v25.2d, v0.2d, v11.d[0] + OP_ir v25.2d, v1.2d, v10.d[0] + + OP_rr v26.2d, v2.2d, v10.d[0] + OP_ii v26.2d, v3.2d, v11.d[0] + OP_ri v27.2d, v2.2d, v11.d[0] + OP_ir v27.2d, v3.2d, v10.d[0] + + OP_rr v28.2d, v0.2d, v10.d[1] + OP_ii v28.2d, v1.2d, v11.d[1] + OP_ri v29.2d, v0.2d, v11.d[1] + OP_ir v29.2d, v1.2d, v10.d[1] + + OP_rr 
v30.2d, v2.2d, v10.d[1] + OP_ii v30.2d, v3.2d, v11.d[1] + OP_ri v31.2d, v2.2d, v11.d[1] + OP_ir v31.2d, v3.2d, v10.d[1] .endm .macro SAVE4x4 @@ -582,25 +582,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.2d, v1.2d}, [pA] add pA, pA, #32 - OP_rr v16.2d, v0.2d, v8.2d[0] - OP_ii v16.2d, v1.2d, v9.2d[0] - OP_ri v17.2d, v0.2d, v9.2d[0] - OP_ir v17.2d, v1.2d, v8.2d[0] - - OP_rr v20.2d, v0.2d, v8.2d[1] - OP_ii v20.2d, v1.2d, v9.2d[1] - OP_ri v21.2d, v0.2d, v9.2d[1] - OP_ir v21.2d, v1.2d, v8.2d[1] - - OP_rr v24.2d, v0.2d, v10.2d[0] - OP_ii v24.2d, v1.2d, v11.2d[0] - OP_ri v25.2d, v0.2d, v11.2d[0] - OP_ir v25.2d, v1.2d, v10.2d[0] - - OP_rr v28.2d, v0.2d, v10.2d[1] - OP_ii v28.2d, v1.2d, v11.2d[1] - OP_ri v29.2d, v0.2d, v11.2d[1] - OP_ir v29.2d, v1.2d, v10.2d[1] + OP_rr v16.2d, v0.2d, v8.d[0] + OP_ii v16.2d, v1.2d, v9.d[0] + OP_ri v17.2d, v0.2d, v9.d[0] + OP_ir v17.2d, v1.2d, v8.d[0] + + OP_rr v20.2d, v0.2d, v8.d[1] + OP_ii v20.2d, v1.2d, v9.d[1] + OP_ri v21.2d, v0.2d, v9.d[1] + OP_ir v21.2d, v1.2d, v8.d[1] + + OP_rr v24.2d, v0.2d, v10.d[0] + OP_ii v24.2d, v1.2d, v11.d[0] + OP_ri v25.2d, v0.2d, v11.d[0] + OP_ir v25.2d, v1.2d, v10.d[0] + + OP_rr v28.2d, v0.2d, v10.d[1] + OP_ii v28.2d, v1.2d, v11.d[1] + OP_ri v29.2d, v0.2d, v11.d[1] + OP_ir v29.2d, v1.2d, v10.d[1] .endm .macro SAVE2x4 @@ -669,25 +669,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld2 {v0.d, v1.d}[0], [pA] add pA, pA, #16 - OP_rr d16, d0, v8.2d[0] - OP_ii d16, d1, v9.2d[0] - OP_ri d17, d0, v9.2d[0] - OP_ir d17, d1, v8.2d[0] - - OP_rr d20, d0, v8.2d[1] - OP_ii d20, d1, v9.2d[1] - OP_ri d21, d0, v9.2d[1] - OP_ir d21, d1, v8.2d[1] - - OP_rr d24, d0, v10.2d[0] - OP_ii d24, d1, v11.2d[0] - OP_ri d25, d0, v11.2d[0] - OP_ir d25, d1, v10.2d[0] - - OP_rr d28, d0, v10.2d[1] - OP_ii d28, d1, v11.2d[1] - OP_ri d29, d0, v11.2d[1] - OP_ir d29, d1, v10.2d[1] + OP_rr d16, d0, v8.d[0] + OP_ii d16, d1, v9.d[0] + OP_ri d17, d0, v9.d[0] + OP_ir d17, d1, v8.d[0] + + OP_rr d20, d0, v8.d[1] + OP_ii d20, d1, v9.d[1] + OP_ri d21, d0, v9.d[1] + OP_ir d21, d1, v8.d[1] + + OP_rr d24, d0, v10.d[0] + OP_ii d24, d1, v11.d[0] + OP_ri d25, d0, v11.d[0] + OP_ir d25, d1, v10.d[0] + + OP_rr d28, d0, v10.d[1] + OP_ii d28, d1, v11.d[1] + OP_ri d29, d0, v11.d[1] + OP_ir d29, d1, v10.d[1] .endm .macro SAVE1x4 @@ -756,25 +756,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld2 {v2.2d, v3.2d}, [pA] add pA, pA, #32 - OP_rr v16.2d, v0.2d, v8.2d[0] - OP_ii v16.2d, v1.2d, v9.2d[0] - OP_ri v17.2d, v0.2d, v9.2d[0] - OP_ir v17.2d, v1.2d, v8.2d[0] - - OP_rr v18.2d, v2.2d, v8.2d[0] - OP_ii v18.2d, v3.2d, v9.2d[0] - OP_ri v19.2d, v2.2d, v9.2d[0] - OP_ir v19.2d, v3.2d, v8.2d[0] - - OP_rr v20.2d, v0.2d, v8.2d[1] - OP_ii v20.2d, v1.2d, v9.2d[1] - OP_ri v21.2d, v0.2d, v9.2d[1] - OP_ir v21.2d, v1.2d, v8.2d[1] - - OP_rr v22.2d, v2.2d, v8.2d[1] - OP_ii v22.2d, v3.2d, v9.2d[1] - OP_ri v23.2d, v2.2d, v9.2d[1] - OP_ir v23.2d, v3.2d, v8.2d[1] + OP_rr v16.2d, v0.2d, v8.d[0] + OP_ii v16.2d, v1.2d, v9.d[0] + OP_ri v17.2d, v0.2d, v9.d[0] + OP_ir v17.2d, v1.2d, v8.d[0] + + OP_rr v18.2d, v2.2d, v8.d[0] + OP_ii v18.2d, v3.2d, v9.d[0] + OP_ri v19.2d, v2.2d, v9.d[0] + OP_ir v19.2d, v3.2d, v8.d[0] + + OP_rr v20.2d, v0.2d, v8.d[1] + OP_ii v20.2d, v1.2d, v9.d[1] + OP_ri v21.2d, v0.2d, v9.d[1] + OP_ir v21.2d, v1.2d, v8.d[1] + + OP_rr v22.2d, v2.2d, v8.d[1] + OP_ii v22.2d, v3.2d, v9.d[1] + OP_ri v23.2d, v2.2d, v9.d[1] + OP_ir v23.2d, v3.2d, v8.d[1] .endm .macro SAVE4x2 @@ -833,15 +833,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.2d, v1.2d}, [pA] add pA, pA, #32 - OP_rr v16.2d, v0.2d, v8.2d[0] - OP_ii v16.2d, v1.2d, v9.2d[0] - OP_ri v17.2d, v0.2d, v9.2d[0] - OP_ir v17.2d, v1.2d, v8.2d[0] + OP_rr v16.2d, v0.2d, v8.d[0] + OP_ii v16.2d, v1.2d, v9.d[0] + OP_ri v17.2d, v0.2d, v9.d[0] + OP_ir v17.2d, v1.2d, v8.d[0] - OP_rr v20.2d, v0.2d, v8.2d[1] - OP_ii v20.2d, v1.2d, v9.2d[1] - OP_ri v21.2d, v0.2d, v9.2d[1] - OP_ir v21.2d, v1.2d, v8.2d[1] + OP_rr v20.2d, v0.2d, v8.d[1] + OP_ii v20.2d, v1.2d, v9.d[1] + OP_ri v21.2d, v0.2d, v9.d[1] + OP_ir v21.2d, v1.2d, v8.d[1] .endm .macro SAVE2x2 @@ -886,15 +886,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld2 {v0.d, v1.d}[0], [pA] add pA, pA, #16 - OP_rr d16, d0, v8.2d[0] - OP_ii d16, d1, v9.2d[0] - OP_ri d17, d0, v9.2d[0] - OP_ir d17, d1, v8.2d[0] + OP_rr d16, d0, v8.d[0] + OP_ii d16, d1, v9.d[0] + OP_ri d17, d0, v9.d[0] + OP_ir d17, d1, v8.d[0] - OP_rr d20, d0, v8.2d[1] - OP_ii d20, d1, v9.2d[1] - OP_ri d21, d0, v9.2d[1] - OP_ir d21, d1, v8.2d[1] + OP_rr d20, d0, v8.d[1] + OP_ii d20, d1, v9.d[1] + OP_ri d21, d0, v9.d[1] + OP_ir d21, d1, v8.d[1] .endm .macro SAVE1x2 diff --git a/kernel/arm64/ztrmm_kernel_4x4.S b/kernel/arm64/ztrmm_kernel_4x4.S index 3ff8227e3..7945870d6 100644 --- a/kernel/arm64/ztrmm_kernel_4x4.S +++ b/kernel/arm64/ztrmm_kernel_4x4.S @@ -185,93 +185,93 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v2.2d, v3.2d}, [pA] add pA, pA, #32 - fmul v16.2d, v0.2d, v8.2d[0] - OP_ii v16.2d, v1.2d, v9.2d[0] + fmul v16.2d, v0.2d, v8.d[0] + OP_ii v16.2d, v1.2d, v9.d[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v17.16b, v17.16b, v17.16b - fmls v17.2d, v0.2d, v9.2d[0] + fmls v17.2d, v0.2d, v9.d[0] #else - fmul v17.2d, v0.2d, v9.2d[0] + fmul v17.2d, v0.2d, v9.d[0] #endif - OP_ir v17.2d, v1.2d, v8.2d[0] + OP_ir v17.2d, v1.2d, v8.d[0] - fmul v18.2d, v2.2d, v8.2d[0] - OP_ii v18.2d, v3.2d, v9.2d[0] + fmul v18.2d, v2.2d, v8.d[0] + OP_ii v18.2d, v3.2d, v9.d[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v19.16b, v19.16b, v19.16b - fmls v19.2d, v2.2d, v9.2d[0] + fmls v19.2d, v2.2d, v9.d[0] #else - fmul v19.2d, v2.2d, v9.2d[0] + fmul v19.2d, v2.2d, v9.d[0] #endif - OP_ir v19.2d, v3.2d, v8.2d[0] + OP_ir v19.2d, v3.2d, v8.d[0] - fmul v20.2d, v0.2d, v8.2d[1] - OP_ii v20.2d, v1.2d, v9.2d[1] + fmul v20.2d, v0.2d, v8.d[1] + OP_ii v20.2d, v1.2d, v9.d[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor 
v21.16b, v21.16b, v21.16b - fmls v21.2d, v0.2d, v9.2d[1] + fmls v21.2d, v0.2d, v9.d[1] #else - fmul v21.2d, v0.2d, v9.2d[1] + fmul v21.2d, v0.2d, v9.d[1] #endif - OP_ir v21.2d, v1.2d, v8.2d[1] + OP_ir v21.2d, v1.2d, v8.d[1] - fmul v22.2d, v2.2d, v8.2d[1] - OP_ii v22.2d, v3.2d, v9.2d[1] + fmul v22.2d, v2.2d, v8.d[1] + OP_ii v22.2d, v3.2d, v9.d[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v23.16b, v23.16b, v23.16b - fmls v23.2d, v2.2d, v9.2d[1] + fmls v23.2d, v2.2d, v9.d[1] #else - fmul v23.2d, v2.2d, v9.2d[1] + fmul v23.2d, v2.2d, v9.d[1] #endif - OP_ir v23.2d, v3.2d, v8.2d[1] + OP_ir v23.2d, v3.2d, v8.d[1] - fmul v24.2d, v0.2d, v10.2d[0] - OP_ii v24.2d, v1.2d, v11.2d[0] + fmul v24.2d, v0.2d, v10.d[0] + OP_ii v24.2d, v1.2d, v11.d[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v25.16b, v25.16b, v25.16b - fmls v25.2d, v0.2d, v11.2d[0] + fmls v25.2d, v0.2d, v11.d[0] #else - fmul v25.2d, v0.2d, v11.2d[0] + fmul v25.2d, v0.2d, v11.d[0] #endif - OP_ir v25.2d, v1.2d, v10.2d[0] + OP_ir v25.2d, v1.2d, v10.d[0] - fmul v26.2d, v2.2d, v10.2d[0] - OP_ii v26.2d, v3.2d, v11.2d[0] + fmul v26.2d, v2.2d, v10.d[0] + OP_ii v26.2d, v3.2d, v11.d[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v27.16b, v27.16b, v27.16b - fmls v27.2d, v2.2d, v11.2d[0] + fmls v27.2d, v2.2d, v11.d[0] #else - fmul v27.2d, v2.2d, v11.2d[0] + fmul v27.2d, v2.2d, v11.d[0] #endif - OP_ir v27.2d, v3.2d, v10.2d[0] + OP_ir v27.2d, v3.2d, v10.d[0] - fmul v28.2d, v0.2d, v10.2d[1] - OP_ii v28.2d, v1.2d, v11.2d[1] + fmul v28.2d, v0.2d, v10.d[1] + OP_ii v28.2d, v1.2d, v11.d[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v29.16b, v29.16b, v29.16b - fmls v29.2d, v0.2d, v11.2d[1] + fmls v29.2d, 
v0.2d, v11.d[1] #else - fmul v29.2d, v0.2d, v11.2d[1] + fmul v29.2d, v0.2d, v11.d[1] #endif - OP_ir v29.2d, v1.2d, v10.2d[1] + OP_ir v29.2d, v1.2d, v10.d[1] - fmul v30.2d, v2.2d, v10.2d[1] - OP_ii v30.2d, v3.2d, v11.2d[1] + fmul v30.2d, v2.2d, v10.d[1] + OP_ii v30.2d, v3.2d, v11.d[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v31.16b, v31.16b, v31.16b - fmls v31.2d, v2.2d, v11.2d[1] + fmls v31.2d, v2.2d, v11.d[1] #else - fmul v31.2d, v2.2d, v11.2d[1] + fmul v31.2d, v2.2d, v11.d[1] #endif - OP_ir v31.2d, v3.2d, v10.2d[1] + OP_ir v31.2d, v3.2d, v10.d[1] ld2 {v12.2d, v13.2d}, [pB] add pB, pB, #32 @@ -284,161 +284,161 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL4x4_M1 - OP_rr v16.2d, v0.2d, v8.2d[0] - OP_ii v16.2d, v1.2d, v9.2d[0] - OP_ri v17.2d, v0.2d, v9.2d[0] - OP_ir v17.2d, v1.2d, v8.2d[0] + OP_rr v16.2d, v0.2d, v8.d[0] + OP_ii v16.2d, v1.2d, v9.d[0] + OP_ri v17.2d, v0.2d, v9.d[0] + OP_ir v17.2d, v1.2d, v8.d[0] ld2 {v12.2d, v13.2d}, [pB] // For next round add pB, pB, #32 - OP_rr v18.2d, v2.2d, v8.2d[0] - OP_ii v18.2d, v3.2d, v9.2d[0] - OP_ri v19.2d, v2.2d, v9.2d[0] - OP_ir v19.2d, v3.2d, v8.2d[0] + OP_rr v18.2d, v2.2d, v8.d[0] + OP_ii v18.2d, v3.2d, v9.d[0] + OP_ri v19.2d, v2.2d, v9.d[0] + OP_ir v19.2d, v3.2d, v8.d[0] ld2 {v14.2d, v15.2d}, [pB] // For next round add pB, pB, #32 - OP_rr v20.2d, v0.2d, v8.2d[1] - OP_ii v20.2d, v1.2d, v9.2d[1] - OP_ri v21.2d, v0.2d, v9.2d[1] - OP_ir v21.2d, v1.2d, v8.2d[1] + OP_rr v20.2d, v0.2d, v8.d[1] + OP_ii v20.2d, v1.2d, v9.d[1] + OP_ri v21.2d, v0.2d, v9.d[1] + OP_ir v21.2d, v1.2d, v8.d[1] ld2 {v4.2d, v5.2d} , [pA] // For next round add pA, pA, #32 - OP_rr v22.2d, v2.2d, v8.2d[1] - OP_ii v22.2d, v3.2d, v9.2d[1] - OP_ri v23.2d, v2.2d, v9.2d[1] - OP_ir v23.2d, v3.2d, v8.2d[1] + OP_rr v22.2d, v2.2d, v8.d[1] + OP_ii v22.2d, v3.2d, v9.d[1] + OP_ri v23.2d, v2.2d, v9.d[1] + OP_ir v23.2d, v3.2d, v8.d[1] 
ld2 {v6.2d, v7.2d} , [pA] // For next round add pA, pA, #32 - OP_rr v24.2d, v0.2d, v10.2d[0] - OP_ii v24.2d, v1.2d, v11.2d[0] - OP_ri v25.2d, v0.2d, v11.2d[0] - OP_ir v25.2d, v1.2d, v10.2d[0] + OP_rr v24.2d, v0.2d, v10.d[0] + OP_ii v24.2d, v1.2d, v11.d[0] + OP_ri v25.2d, v0.2d, v11.d[0] + OP_ir v25.2d, v1.2d, v10.d[0] prfm PLDL1KEEP, [pA, #512] - OP_rr v26.2d, v2.2d, v10.2d[0] - OP_ii v26.2d, v3.2d, v11.2d[0] - OP_ri v27.2d, v2.2d, v11.2d[0] - OP_ir v27.2d, v3.2d, v10.2d[0] + OP_rr v26.2d, v2.2d, v10.d[0] + OP_ii v26.2d, v3.2d, v11.d[0] + OP_ri v27.2d, v2.2d, v11.d[0] + OP_ir v27.2d, v3.2d, v10.d[0] prfm PLDL1KEEP, [pB, #512] - OP_rr v28.2d, v0.2d, v10.2d[1] - OP_ii v28.2d, v1.2d, v11.2d[1] - OP_ri v29.2d, v0.2d, v11.2d[1] - OP_ir v29.2d, v1.2d, v10.2d[1] + OP_rr v28.2d, v0.2d, v10.d[1] + OP_ii v28.2d, v1.2d, v11.d[1] + OP_ri v29.2d, v0.2d, v11.d[1] + OP_ir v29.2d, v1.2d, v10.d[1] - OP_rr v30.2d, v2.2d, v10.2d[1] - OP_ii v30.2d, v3.2d, v11.2d[1] - OP_ri v31.2d, v2.2d, v11.2d[1] - OP_ir v31.2d, v3.2d, v10.2d[1] + OP_rr v30.2d, v2.2d, v10.d[1] + OP_ii v30.2d, v3.2d, v11.d[1] + OP_ri v31.2d, v2.2d, v11.d[1] + OP_ir v31.2d, v3.2d, v10.d[1] .endm .macro KERNEL4x4_M2 - OP_rr v16.2d, v4.2d, v12.2d[0] - OP_ii v16.2d, v5.2d, v13.2d[0] - OP_ri v17.2d, v4.2d, v13.2d[0] - OP_ir v17.2d, v5.2d, v12.2d[0] + OP_rr v16.2d, v4.2d, v12.d[0] + OP_ii v16.2d, v5.2d, v13.d[0] + OP_ri v17.2d, v4.2d, v13.d[0] + OP_ir v17.2d, v5.2d, v12.d[0] ld2 {v8.2d, v9.2d}, [pB] // For next round add pB, pB, #32 - OP_rr v18.2d, v6.2d, v12.2d[0] - OP_ii v18.2d, v7.2d, v13.2d[0] - OP_ri v19.2d, v6.2d, v13.2d[0] - OP_ir v19.2d, v7.2d, v12.2d[0] + OP_rr v18.2d, v6.2d, v12.d[0] + OP_ii v18.2d, v7.2d, v13.d[0] + OP_ri v19.2d, v6.2d, v13.d[0] + OP_ir v19.2d, v7.2d, v12.d[0] ld2 {v10.2d, v11.2d}, [pB] // For next round add pB, pB, #32 - OP_rr v20.2d, v4.2d, v12.2d[1] - OP_ii v20.2d, v5.2d, v13.2d[1] - OP_ri v21.2d, v4.2d, v13.2d[1] - OP_ir v21.2d, v5.2d, v12.2d[1] + OP_rr v20.2d, v4.2d, v12.d[1] + OP_ii v20.2d, 
v5.2d, v13.d[1] + OP_ri v21.2d, v4.2d, v13.d[1] + OP_ir v21.2d, v5.2d, v12.d[1] ld2 {v0.2d, v1.2d}, [pA] // For next round add pA, pA, #32 - OP_rr v22.2d, v6.2d, v12.2d[1] - OP_ii v22.2d, v7.2d, v13.2d[1] - OP_ri v23.2d, v6.2d, v13.2d[1] - OP_ir v23.2d, v7.2d, v12.2d[1] + OP_rr v22.2d, v6.2d, v12.d[1] + OP_ii v22.2d, v7.2d, v13.d[1] + OP_ri v23.2d, v6.2d, v13.d[1] + OP_ir v23.2d, v7.2d, v12.d[1] ld2 {v2.2d, v3.2d}, [pA] // For next round add pA, pA, #32 - OP_rr v24.2d, v4.2d, v14.2d[0] - OP_ii v24.2d, v5.2d, v15.2d[0] - OP_ri v25.2d, v4.2d, v15.2d[0] - OP_ir v25.2d, v5.2d, v14.2d[0] + OP_rr v24.2d, v4.2d, v14.d[0] + OP_ii v24.2d, v5.2d, v15.d[0] + OP_ri v25.2d, v4.2d, v15.d[0] + OP_ir v25.2d, v5.2d, v14.d[0] prfm PLDL1KEEP, [pA, #512] - OP_rr v26.2d, v6.2d, v14.2d[0] - OP_ii v26.2d, v7.2d, v15.2d[0] - OP_ri v27.2d, v6.2d, v15.2d[0] - OP_ir v27.2d, v7.2d, v14.2d[0] + OP_rr v26.2d, v6.2d, v14.d[0] + OP_ii v26.2d, v7.2d, v15.d[0] + OP_ri v27.2d, v6.2d, v15.d[0] + OP_ir v27.2d, v7.2d, v14.d[0] prfm PLDL1KEEP, [pB, #512] - OP_rr v28.2d, v4.2d, v14.2d[1] - OP_ii v28.2d, v5.2d, v15.2d[1] - OP_ri v29.2d, v4.2d, v15.2d[1] - OP_ir v29.2d, v5.2d, v14.2d[1] + OP_rr v28.2d, v4.2d, v14.d[1] + OP_ii v28.2d, v5.2d, v15.d[1] + OP_ri v29.2d, v4.2d, v15.d[1] + OP_ir v29.2d, v5.2d, v14.d[1] - OP_rr v30.2d, v6.2d, v14.2d[1] - OP_ii v30.2d, v7.2d, v15.2d[1] - OP_ri v31.2d, v6.2d, v15.2d[1] - OP_ir v31.2d, v7.2d, v14.2d[1] + OP_rr v30.2d, v6.2d, v14.d[1] + OP_ii v30.2d, v7.2d, v15.d[1] + OP_ri v31.2d, v6.2d, v15.d[1] + OP_ir v31.2d, v7.2d, v14.d[1] .endm .macro KERNEL4x4_E - OP_rr v16.2d, v4.2d, v12.2d[0] - OP_ii v16.2d, v5.2d, v13.2d[0] - OP_ri v17.2d, v4.2d, v13.2d[0] - OP_ir v17.2d, v5.2d, v12.2d[0] - - OP_rr v18.2d, v6.2d, v12.2d[0] - OP_ii v18.2d, v7.2d, v13.2d[0] - OP_ri v19.2d, v6.2d, v13.2d[0] - OP_ir v19.2d, v7.2d, v12.2d[0] - - OP_rr v20.2d, v4.2d, v12.2d[1] - OP_ii v20.2d, v5.2d, v13.2d[1] - OP_ri v21.2d, v4.2d, v13.2d[1] - OP_ir v21.2d, v5.2d, v12.2d[1] - - OP_rr v22.2d, 
v6.2d, v12.2d[1] - OP_ii v22.2d, v7.2d, v13.2d[1] - OP_ri v23.2d, v6.2d, v13.2d[1] - OP_ir v23.2d, v7.2d, v12.2d[1] - - OP_rr v24.2d, v4.2d, v14.2d[0] - OP_ii v24.2d, v5.2d, v15.2d[0] - OP_ri v25.2d, v4.2d, v15.2d[0] - OP_ir v25.2d, v5.2d, v14.2d[0] - - OP_rr v26.2d, v6.2d, v14.2d[0] - OP_ii v26.2d, v7.2d, v15.2d[0] - OP_ri v27.2d, v6.2d, v15.2d[0] - OP_ir v27.2d, v7.2d, v14.2d[0] - - OP_rr v28.2d, v4.2d, v14.2d[1] - OP_ii v28.2d, v5.2d, v15.2d[1] - OP_ri v29.2d, v4.2d, v15.2d[1] - OP_ir v29.2d, v5.2d, v14.2d[1] - - OP_rr v30.2d, v6.2d, v14.2d[1] - OP_ii v30.2d, v7.2d, v15.2d[1] - OP_ri v31.2d, v6.2d, v15.2d[1] - OP_ir v31.2d, v7.2d, v14.2d[1] + OP_rr v16.2d, v4.2d, v12.d[0] + OP_ii v16.2d, v5.2d, v13.d[0] + OP_ri v17.2d, v4.2d, v13.d[0] + OP_ir v17.2d, v5.2d, v12.d[0] + + OP_rr v18.2d, v6.2d, v12.d[0] + OP_ii v18.2d, v7.2d, v13.d[0] + OP_ri v19.2d, v6.2d, v13.d[0] + OP_ir v19.2d, v7.2d, v12.d[0] + + OP_rr v20.2d, v4.2d, v12.d[1] + OP_ii v20.2d, v5.2d, v13.d[1] + OP_ri v21.2d, v4.2d, v13.d[1] + OP_ir v21.2d, v5.2d, v12.d[1] + + OP_rr v22.2d, v6.2d, v12.d[1] + OP_ii v22.2d, v7.2d, v13.d[1] + OP_ri v23.2d, v6.2d, v13.d[1] + OP_ir v23.2d, v7.2d, v12.d[1] + + OP_rr v24.2d, v4.2d, v14.d[0] + OP_ii v24.2d, v5.2d, v15.d[0] + OP_ri v25.2d, v4.2d, v15.d[0] + OP_ir v25.2d, v5.2d, v14.d[0] + + OP_rr v26.2d, v6.2d, v14.d[0] + OP_ii v26.2d, v7.2d, v15.d[0] + OP_ri v27.2d, v6.2d, v15.d[0] + OP_ir v27.2d, v7.2d, v14.d[0] + + OP_rr v28.2d, v4.2d, v14.d[1] + OP_ii v28.2d, v5.2d, v15.d[1] + OP_ri v29.2d, v4.2d, v15.d[1] + OP_ir v29.2d, v5.2d, v14.d[1] + + OP_rr v30.2d, v6.2d, v14.d[1] + OP_ii v30.2d, v7.2d, v15.d[1] + OP_ri v31.2d, v6.2d, v15.d[1] + OP_ir v31.2d, v7.2d, v14.d[1] .endm .macro KERNEL4x4_SUB @@ -451,45 +451,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld2 {v2.2d, v3.2d}, [pA] add pA, pA, #32 - OP_rr v16.2d, v0.2d, v8.2d[0] - OP_ii v16.2d, v1.2d, v9.2d[0] - OP_ri v17.2d, v0.2d, v9.2d[0] - OP_ir v17.2d, v1.2d, v8.2d[0] - - OP_rr v18.2d, v2.2d, v8.2d[0] - OP_ii v18.2d, v3.2d, v9.2d[0] - OP_ri v19.2d, v2.2d, v9.2d[0] - OP_ir v19.2d, v3.2d, v8.2d[0] - - OP_rr v20.2d, v0.2d, v8.2d[1] - OP_ii v20.2d, v1.2d, v9.2d[1] - OP_ri v21.2d, v0.2d, v9.2d[1] - OP_ir v21.2d, v1.2d, v8.2d[1] - - OP_rr v22.2d, v2.2d, v8.2d[1] - OP_ii v22.2d, v3.2d, v9.2d[1] - OP_ri v23.2d, v2.2d, v9.2d[1] - OP_ir v23.2d, v3.2d, v8.2d[1] - - OP_rr v24.2d, v0.2d, v10.2d[0] - OP_ii v24.2d, v1.2d, v11.2d[0] - OP_ri v25.2d, v0.2d, v11.2d[0] - OP_ir v25.2d, v1.2d, v10.2d[0] - - OP_rr v26.2d, v2.2d, v10.2d[0] - OP_ii v26.2d, v3.2d, v11.2d[0] - OP_ri v27.2d, v2.2d, v11.2d[0] - OP_ir v27.2d, v3.2d, v10.2d[0] - - OP_rr v28.2d, v0.2d, v10.2d[1] - OP_ii v28.2d, v1.2d, v11.2d[1] - OP_ri v29.2d, v0.2d, v11.2d[1] - OP_ir v29.2d, v1.2d, v10.2d[1] - - OP_rr v30.2d, v2.2d, v10.2d[1] - OP_ii v30.2d, v3.2d, v11.2d[1] - OP_ri v31.2d, v2.2d, v11.2d[1] - OP_ir v31.2d, v3.2d, v10.2d[1] + OP_rr v16.2d, v0.2d, v8.d[0] + OP_ii v16.2d, v1.2d, v9.d[0] + OP_ri v17.2d, v0.2d, v9.d[0] + OP_ir v17.2d, v1.2d, v8.d[0] + + OP_rr v18.2d, v2.2d, v8.d[0] + OP_ii v18.2d, v3.2d, v9.d[0] + OP_ri v19.2d, v2.2d, v9.d[0] + OP_ir v19.2d, v3.2d, v8.d[0] + + OP_rr v20.2d, v0.2d, v8.d[1] + OP_ii v20.2d, v1.2d, v9.d[1] + OP_ri v21.2d, v0.2d, v9.d[1] + OP_ir v21.2d, v1.2d, v8.d[1] + + OP_rr v22.2d, v2.2d, v8.d[1] + OP_ii v22.2d, v3.2d, v9.d[1] + OP_ri v23.2d, v2.2d, v9.d[1] + OP_ir v23.2d, v3.2d, v8.d[1] + + OP_rr v24.2d, v0.2d, v10.d[0] + OP_ii v24.2d, v1.2d, v11.d[0] + OP_ri v25.2d, v0.2d, v11.d[0] + OP_ir v25.2d, v1.2d, v10.d[0] + + OP_rr v26.2d, v2.2d, v10.d[0] + OP_ii v26.2d, v3.2d, v11.d[0] + OP_ri v27.2d, v2.2d, v11.d[0] + OP_ir v27.2d, v3.2d, v10.d[0] + + OP_rr v28.2d, v0.2d, v10.d[1] + OP_ii v28.2d, v1.2d, v11.d[1] + OP_ri v29.2d, v0.2d, v11.d[1] + OP_ir v29.2d, v1.2d, v10.d[1] + + OP_rr 
v30.2d, v2.2d, v10.d[1] + OP_ii v30.2d, v3.2d, v11.d[1] + OP_ri v31.2d, v2.2d, v11.d[1] + OP_ir v31.2d, v3.2d, v10.d[1] .endm .macro SAVE4x4 @@ -577,25 +577,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.2d, v1.2d}, [pA] add pA, pA, #32 - OP_rr v16.2d, v0.2d, v8.2d[0] - OP_ii v16.2d, v1.2d, v9.2d[0] - OP_ri v17.2d, v0.2d, v9.2d[0] - OP_ir v17.2d, v1.2d, v8.2d[0] - - OP_rr v20.2d, v0.2d, v8.2d[1] - OP_ii v20.2d, v1.2d, v9.2d[1] - OP_ri v21.2d, v0.2d, v9.2d[1] - OP_ir v21.2d, v1.2d, v8.2d[1] - - OP_rr v24.2d, v0.2d, v10.2d[0] - OP_ii v24.2d, v1.2d, v11.2d[0] - OP_ri v25.2d, v0.2d, v11.2d[0] - OP_ir v25.2d, v1.2d, v10.2d[0] - - OP_rr v28.2d, v0.2d, v10.2d[1] - OP_ii v28.2d, v1.2d, v11.2d[1] - OP_ri v29.2d, v0.2d, v11.2d[1] - OP_ir v29.2d, v1.2d, v10.2d[1] + OP_rr v16.2d, v0.2d, v8.d[0] + OP_ii v16.2d, v1.2d, v9.d[0] + OP_ri v17.2d, v0.2d, v9.d[0] + OP_ir v17.2d, v1.2d, v8.d[0] + + OP_rr v20.2d, v0.2d, v8.d[1] + OP_ii v20.2d, v1.2d, v9.d[1] + OP_ri v21.2d, v0.2d, v9.d[1] + OP_ir v21.2d, v1.2d, v8.d[1] + + OP_rr v24.2d, v0.2d, v10.d[0] + OP_ii v24.2d, v1.2d, v11.d[0] + OP_ri v25.2d, v0.2d, v11.d[0] + OP_ir v25.2d, v1.2d, v10.d[0] + + OP_rr v28.2d, v0.2d, v10.d[1] + OP_ii v28.2d, v1.2d, v11.d[1] + OP_ri v29.2d, v0.2d, v11.d[1] + OP_ir v29.2d, v1.2d, v10.d[1] .endm .macro SAVE2x4 @@ -660,25 +660,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld2 {v0.d, v1.d}[0], [pA] add pA, pA, #16 - OP_rr d16, d0, v8.2d[0] - OP_ii d16, d1, v9.2d[0] - OP_ri d17, d0, v9.2d[0] - OP_ir d17, d1, v8.2d[0] - - OP_rr d20, d0, v8.2d[1] - OP_ii d20, d1, v9.2d[1] - OP_ri d21, d0, v9.2d[1] - OP_ir d21, d1, v8.2d[1] - - OP_rr d24, d0, v10.2d[0] - OP_ii d24, d1, v11.2d[0] - OP_ri d25, d0, v11.2d[0] - OP_ir d25, d1, v10.2d[0] - - OP_rr d28, d0, v10.2d[1] - OP_ii d28, d1, v11.2d[1] - OP_ri d29, d0, v11.2d[1] - OP_ir d29, d1, v10.2d[1] + OP_rr d16, d0, v8.d[0] + OP_ii d16, d1, v9.d[0] + OP_ri d17, d0, v9.d[0] + OP_ir d17, d1, v8.d[0] + + OP_rr d20, d0, v8.d[1] + OP_ii d20, d1, v9.d[1] + OP_ri d21, d0, v9.d[1] + OP_ir d21, d1, v8.d[1] + + OP_rr d24, d0, v10.d[0] + OP_ii d24, d1, v11.d[0] + OP_ri d25, d0, v11.d[0] + OP_ir d25, d1, v10.d[0] + + OP_rr d28, d0, v10.d[1] + OP_ii d28, d1, v11.d[1] + OP_ri d29, d0, v11.d[1] + OP_ir d29, d1, v10.d[1] .endm .macro SAVE1x4 @@ -743,25 +743,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld2 {v2.2d, v3.2d}, [pA] add pA, pA, #32 - OP_rr v16.2d, v0.2d, v8.2d[0] - OP_ii v16.2d, v1.2d, v9.2d[0] - OP_ri v17.2d, v0.2d, v9.2d[0] - OP_ir v17.2d, v1.2d, v8.2d[0] - - OP_rr v18.2d, v2.2d, v8.2d[0] - OP_ii v18.2d, v3.2d, v9.2d[0] - OP_ri v19.2d, v2.2d, v9.2d[0] - OP_ir v19.2d, v3.2d, v8.2d[0] - - OP_rr v20.2d, v0.2d, v8.2d[1] - OP_ii v20.2d, v1.2d, v9.2d[1] - OP_ri v21.2d, v0.2d, v9.2d[1] - OP_ir v21.2d, v1.2d, v8.2d[1] - - OP_rr v22.2d, v2.2d, v8.2d[1] - OP_ii v22.2d, v3.2d, v9.2d[1] - OP_ri v23.2d, v2.2d, v9.2d[1] - OP_ir v23.2d, v3.2d, v8.2d[1] + OP_rr v16.2d, v0.2d, v8.d[0] + OP_ii v16.2d, v1.2d, v9.d[0] + OP_ri v17.2d, v0.2d, v9.d[0] + OP_ir v17.2d, v1.2d, v8.d[0] + + OP_rr v18.2d, v2.2d, v8.d[0] + OP_ii v18.2d, v3.2d, v9.d[0] + OP_ri v19.2d, v2.2d, v9.d[0] + OP_ir v19.2d, v3.2d, v8.d[0] + + OP_rr v20.2d, v0.2d, v8.d[1] + OP_ii v20.2d, v1.2d, v9.d[1] + OP_ri v21.2d, v0.2d, v9.d[1] + OP_ir v21.2d, v1.2d, v8.d[1] + + OP_rr v22.2d, v2.2d, v8.d[1] + OP_ii v22.2d, v3.2d, v9.d[1] + OP_ri v23.2d, v2.2d, v9.d[1] + OP_ir v23.2d, v3.2d, v8.d[1] .endm .macro SAVE4x2 @@ -816,15 +816,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.2d, v1.2d}, [pA] add pA, pA, #32 - OP_rr v16.2d, v0.2d, v8.2d[0] - OP_ii v16.2d, v1.2d, v9.2d[0] - OP_ri v17.2d, v0.2d, v9.2d[0] - OP_ir v17.2d, v1.2d, v8.2d[0] + OP_rr v16.2d, v0.2d, v8.d[0] + OP_ii v16.2d, v1.2d, v9.d[0] + OP_ri v17.2d, v0.2d, v9.d[0] + OP_ir v17.2d, v1.2d, v8.d[0] - OP_rr v20.2d, v0.2d, v8.2d[1] - OP_ii v20.2d, v1.2d, v9.2d[1] - OP_ri v21.2d, v0.2d, v9.2d[1] - OP_ir v21.2d, v1.2d, v8.2d[1] + OP_rr v20.2d, v0.2d, v8.d[1] + OP_ii v20.2d, v1.2d, v9.d[1] + OP_ri v21.2d, v0.2d, v9.d[1] + OP_ir v21.2d, v1.2d, v8.d[1] .endm .macro SAVE2x2 @@ -867,15 +867,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld2 {v0.d, v1.d}[0], [pA] add pA, pA, #16 - OP_rr d16, d0, v8.2d[0] - OP_ii d16, d1, v9.2d[0] - OP_ri d17, d0, v9.2d[0] - OP_ir d17, d1, v8.2d[0] + OP_rr d16, d0, v8.d[0] + OP_ii d16, d1, v9.d[0] + OP_ri d17, d0, v9.d[0] + OP_ir d17, d1, v8.d[0] - OP_rr d20, d0, v8.2d[1] - OP_ii d20, d1, v9.2d[1] - OP_ri d21, d0, v9.2d[1] - OP_ir d21, d1, v8.2d[1] + OP_rr d20, d0, v8.d[1] + OP_ii d20, d1, v9.d[1] + OP_ri d21, d0, v9.d[1] + OP_ir d21, d1, v8.d[1] .endm .macro SAVE1x2 From 1e6cf9808c08c372962f1b762a4c1a7665367491 Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Fri, 25 Mar 2016 09:42:08 +0100 Subject: [PATCH 18/48] added optimized dscal kernel for POWER8 --- kernel/power/KERNEL.POWER8 | 4 +- kernel/power/dscal.c | 174 +++++++++++++++++++++++ kernel/power/dscal_microk_power8.c | 219 +++++++++++++++++++++++++++++ 3 files changed, 395 insertions(+), 2 deletions(-) create mode 100644 kernel/power/dscal.c create mode 100644 kernel/power/dscal_microk_power8.c diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index 1f9cef0e5..5fb44cbdb 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -136,9 +136,9 @@ ZDOTKERNEL = zdot.c #ZROTKERNEL = ../arm/zrot.c # #SSCALKERNEL = ../arm/scal.c -#DSCALKERNEL = ../arm/scal.c +DSCALKERNEL = dscal.c #CSCALKERNEL = ../arm/zscal.c -#ZSCALKERNEL = ../arm/zscal.c +#ZSCALKERNEL = zscal.c # #SSWAPKERNEL = ../arm/swap.c #DSWAPKERNEL = ../arm/swap.c diff --git a/kernel/power/dscal.c b/kernel/power/dscal.c new file mode 100644 index 000000000..c62a56315 --- /dev/null +++ b/kernel/power/dscal.c @@ -0,0 +1,174 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2016/03/25 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#include "common.h" + +#if defined(POWER8) +#include "dscal_microk_power8.c" +#endif + +#if !defined(HAVE_KERNEL_8) + +static void dscal_kernel_8( BLASLONG n, FLOAT *da , FLOAT *x ) +{ + + BLASLONG i; + FLOAT alpha = *da; + + for( i=0; i 0 ) + { + FLOAT alpha[2]; + alpha[0]=da; + alpha[1]=da; + dscal_kernel_8_zero(n1 , alpha , x); + j=n1; + } + + while(j < n) + { + + x[j]=0.0; + j++; + } + + } + else + { + + BLASLONG n1 = n & -16; + if ( n1 > 0 ) + { + FLOAT alpha[2]; + alpha[0]=da; + alpha[1]=da; + dscal_kernel_8(n1 , alpha , x); + j=n1; + } + while(j < n) + { + + x[j] = da * x[j] ; + j++; + } + } + + + } + else + { + + if ( da == 0.0 ) + { + + while(j < n) + { + + x[i]=0.0; + i += inc_x ; + j++; + } + + } + else + { + + while(j < n) + { + + x[i] = da * x[i] ; + i += inc_x ; + j++; + } + } + + } + return 0; + +} + + diff --git a/kernel/power/dscal_microk_power8.c b/kernel/power/dscal_microk_power8.c new file mode 100644 index 000000000..d90c3d80c --- /dev/null +++ b/kernel/power/dscal_microk_power8.c @@ -0,0 +1,219 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2016/03/25 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#define HAVE_KERNEL_8 1 + +static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline)); + +static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + + + BLASLONG i = n; + BLASLONG o16 = 16; + BLASLONG o32 = 32; + BLASLONG o48 = 48; + BLASLONG o64 = 64; + BLASLONG o80 = 80; + BLASLONG o96 = 96; + BLASLONG o112 = 112; + FLOAT *x1=x; + FLOAT *x2=x+1; + BLASLONG pre = 384; + + __asm__ __volatile__ + ( + + "lxsdx 33, 0, %3 \n\t" + "xxspltd 32, 33, 0 \n\t" + "addi %1, %1, -8 \n\t" + + "dcbt %2, %4 \n\t" + + "lxvd2x 40, 0, %2 \n\t" + "lxvd2x 41, %5, %2 \n\t" + "lxvd2x 42, %6, %2 \n\t" + "lxvd2x 43, %7, %2 \n\t" + "lxvd2x 44, %8, %2 \n\t" + "lxvd2x 45, %9, %2 \n\t" + "lxvd2x 46, %10, %2 \n\t" + "lxvd2x 47, %11, %2 \n\t" + + "addi %2, %2, 128 \n\t" + + "addic. 
%0 , %0 , -16 \n\t" + "ble 2f \n\t" + + ".align 5 \n\t" + "1: \n\t" + + "dcbt %2, %4 \n\t" + + "xvmuldp 48, 40, 32 \n\t" + "xvmuldp 49, 41, 32 \n\t" + "lxvd2x 40, 0, %2 \n\t" + "lxvd2x 41, %5, %2 \n\t" + "xvmuldp 50, 42, 32 \n\t" + "xvmuldp 51, 43, 32 \n\t" + "lxvd2x 42, %6, %2 \n\t" + "lxvd2x 43, %7, %2 \n\t" + "xvmuldp 52, 44, 32 \n\t" + "xvmuldp 53, 45, 32 \n\t" + "lxvd2x 44, %8, %2 \n\t" + "lxvd2x 45, %9, %2 \n\t" + "xvmuldp 54, 46, 32 \n\t" + "xvmuldp 55, 47, 32 \n\t" + "lxvd2x 46, %10, %2 \n\t" + "lxvd2x 47, %11, %2 \n\t" + + "stxvd2x 48, 0, %1 \n\t" + "stxvd2x 49, %5, %1 \n\t" + "stxvd2x 50, %6, %1 \n\t" + "stxvd2x 51, %7, %1 \n\t" + "stxvd2x 52, %8, %1 \n\t" + "stxvd2x 53, %9, %1 \n\t" + "stxvd2x 54, %10, %1 \n\t" + "stxvd2x 55, %11, %1 \n\t" + + "addi %1, %1, 128 \n\t" + "addi %2, %2, 128 \n\t" + + "addic. %0 , %0 , -16 \n\t" + "bgt 1b \n\t" + + "2: \n\t" + + "xvmuldp 48, 40, 32 \n\t" + "xvmuldp 49, 41, 32 \n\t" + "xvmuldp 50, 42, 32 \n\t" + "xvmuldp 51, 43, 32 \n\t" + "xvmuldp 52, 44, 32 \n\t" + "xvmuldp 53, 45, 32 \n\t" + "xvmuldp 54, 46, 32 \n\t" + "xvmuldp 55, 47, 32 \n\t" + + "stxvd2x 48, 0, %1 \n\t" + "stxvd2x 49, %5, %1 \n\t" + "stxvd2x 50, %6, %1 \n\t" + "stxvd2x 51, %7, %1 \n\t" + "stxvd2x 52, %8, %1 \n\t" + "stxvd2x 53, %9, %1 \n\t" + "stxvd2x 54, %10, %1 \n\t" + "stxvd2x 55, %11, %1 \n\t" + + : + : + "r" (i), // 0 + "r" (x2), // 1 + "r" (x1), // 2 + "r" (alpha), // 3 + "r" (pre), // 4 + "r" (o16), // 5 + "r" (o32), // 6 + "r" (o48), // 7 + "r" (o64), // 8 + "r" (o80), // 9 + "r" (o96), // 10 + "r" (o112) // 11 + : "cr0", "%0", "%2" , "%1", "memory" + ); + +} + + +static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline)); + +static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + + + BLASLONG i = n; + BLASLONG o16 = 16; + BLASLONG o32 = 32; + BLASLONG o48 = 48; + BLASLONG o64 = 64; + BLASLONG o80 = 80; + BLASLONG o96 = 96; + BLASLONG o112 = 112; + FLOAT *x1=x; + FLOAT *x2=x+1; + BLASLONG pre 
= 384; + + __asm__ __volatile__ + ( + + "xxlxor 32 , 32 , 32 \n\t" + "addi %1, %1, -8 \n\t" + + + ".align 5 \n\t" + "1: \n\t" + + "stxvd2x 32, 0, %1 \n\t" + "stxvd2x 32, %5, %1 \n\t" + "stxvd2x 32, %6, %1 \n\t" + "stxvd2x 32, %7, %1 \n\t" + "stxvd2x 32, %8, %1 \n\t" + "stxvd2x 32, %9, %1 \n\t" + "stxvd2x 32, %10, %1 \n\t" + "stxvd2x 32, %11, %1 \n\t" + + "addi %1, %1, 128 \n\t" + + "addic. %0 , %0 , -16 \n\t" + "bgt 1b \n\t" + + "2: \n\t" + + : + : + "r" (i), // 0 + "r" (x2), // 1 + "r" (x1), // 2 + "r" (alpha), // 3 + "r" (pre), // 4 + "r" (o16), // 5 + "r" (o32), // 6 + "r" (o48), // 7 + "r" (o64), // 8 + "r" (o80), // 9 + "r" (o96), // 10 + "r" (o112) // 11 + : "cr0", "%0", "%2" , "%1", "memory" + ); + +} + + From 0bff057a874a706e25ac5e02474327ed50f95444 Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Fri, 25 Mar 2016 13:03:02 +0100 Subject: [PATCH 19/48] added optimized dcopy kernel for POWER8 --- kernel/power/KERNEL.POWER8 | 2 +- kernel/power/dcopy.c | 131 ++++++++++++++++++++++ kernel/power/dcopy_microk_power8.c | 174 +++++++++++++++++++++++++++++ 3 files changed, 306 insertions(+), 1 deletion(-) create mode 100644 kernel/power/dcopy.c create mode 100644 kernel/power/dcopy_microk_power8.c diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index 5fb44cbdb..a0e4ac3f9 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -116,7 +116,7 @@ DAXPYKERNEL = daxpy.c ZAXPYKERNEL = zaxpy.c # #SCOPYKERNEL = ../arm/copy.c -#DCOPYKERNEL = ../arm/copy.c +DCOPYKERNEL = dcopy.c #CCOPYKERNEL = ../arm/zcopy.c #ZCOPYKERNEL = ../arm/zcopy.c # diff --git a/kernel/power/dcopy.c b/kernel/power/dcopy.c new file mode 100644 index 000000000..059c0e5a9 --- /dev/null +++ b/kernel/power/dcopy.c @@ -0,0 +1,131 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2016/03/25 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#include "common.h" + +#if defined(POWER8) +#include "dcopy_microk_power8.c" +#endif + +#ifndef HAVE_KERNEL_32 + +static void dcopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) +{ + + BLASLONG i=0; + FLOAT f0, f1, f2, f3, f4, f5, f6, f7; + FLOAT *x1=x; + FLOAT *y1=y; + + while ( i 0 ) + { + dcopy_kernel_32(n1, x, y); + i=n1; + } + + while(i < n) + { + y[i] = x[i] ; + i++ ; + + } + + + } + else + { + + while(i < n) + { + y[iy] = x[ix] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + + } + return(0); + + +} + + diff --git a/kernel/power/dcopy_microk_power8.c b/kernel/power/dcopy_microk_power8.c new file mode 100644 index 000000000..04f7db556 --- /dev/null +++ b/kernel/power/dcopy_microk_power8.c @@ -0,0 +1,174 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/03/25 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#define HAVE_KERNEL_32 1 + +static void dcopy_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); + +static void dcopy_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y) +{ + + + BLASLONG i = n; + BLASLONG o16 = 16; + BLASLONG o32 = 32; + BLASLONG o48 = 48; + BLASLONG o64 = 64; + BLASLONG o80 = 80; + BLASLONG o96 = 96; + BLASLONG o112 = 112; + FLOAT *x1=x; + FLOAT *y1=y; + BLASLONG pre = 384; + BLASLONG alpha=0; + + __asm__ __volatile__ + ( + + "lxvd2x 40, 0, %2 \n\t" + "lxvd2x 41, %5, %2 \n\t" + "lxvd2x 42, %6, %2 \n\t" + "lxvd2x 43, %7, %2 \n\t" + "lxvd2x 44, %8, %2 \n\t" + "lxvd2x 45, %9, %2 \n\t" + "lxvd2x 46, %10, %2 \n\t" + "lxvd2x 47, %11, %2 \n\t" + + "addi %2, %2, 128 \n\t" + + "lxvd2x 50, 0, %2 \n\t" + "lxvd2x 51, %5, %2 \n\t" + "lxvd2x 52, %6, %2 \n\t" + "lxvd2x 53, %7, %2 \n\t" + "lxvd2x 54, %8, %2 \n\t" + "lxvd2x 55, 
%9, %2 \n\t" + "lxvd2x 56, %10, %2 \n\t" + "lxvd2x 57, %11, %2 \n\t" + + "addi %2, %2, 128 \n\t" + + "addic. %0 , %0 , -32 \n\t" + "ble 2f \n\t" + + ".align 5 \n\t" + "1: \n\t" + + "stxvd2x 40, 0, %1 \n\t" + "stxvd2x 41, %5, %1 \n\t" + "lxvd2x 40, 0, %2 \n\t" + "lxvd2x 41, %5, %2 \n\t" + "stxvd2x 42, %6, %1 \n\t" + "stxvd2x 43, %7, %1 \n\t" + "lxvd2x 42, %6, %2 \n\t" + "lxvd2x 43, %7, %2 \n\t" + "stxvd2x 44, %8, %1 \n\t" + "stxvd2x 45, %9, %1 \n\t" + "lxvd2x 44, %8, %2 \n\t" + "lxvd2x 45, %9, %2 \n\t" + "stxvd2x 46, %10, %1 \n\t" + "stxvd2x 47, %11, %1 \n\t" + "lxvd2x 46, %10, %2 \n\t" + "lxvd2x 47, %11, %2 \n\t" + + + "addi %1, %1, 128 \n\t" + "addi %2, %2, 128 \n\t" + + "stxvd2x 50, 0, %1 \n\t" + "stxvd2x 51, %5, %1 \n\t" + "lxvd2x 50, 0, %2 \n\t" + "lxvd2x 51, %5, %2 \n\t" + "stxvd2x 52, %6, %1 \n\t" + "stxvd2x 53, %7, %1 \n\t" + "lxvd2x 52, %6, %2 \n\t" + "lxvd2x 53, %7, %2 \n\t" + "stxvd2x 54, %8, %1 \n\t" + "stxvd2x 55, %9, %1 \n\t" + "lxvd2x 54, %8, %2 \n\t" + "lxvd2x 55, %9, %2 \n\t" + "stxvd2x 56, %10, %1 \n\t" + "stxvd2x 57, %11, %1 \n\t" + "lxvd2x 56, %10, %2 \n\t" + "lxvd2x 57, %11, %2 \n\t" + + "addi %1, %1, 128 \n\t" + "addi %2, %2, 128 \n\t" + + "addic. 
%0 , %0 , -32 \n\t" + "bgt 1b \n\t" + + "2: \n\t" + + "stxvd2x 40, 0, %1 \n\t" + "stxvd2x 41, %5, %1 \n\t" + "stxvd2x 42, %6, %1 \n\t" + "stxvd2x 43, %7, %1 \n\t" + "stxvd2x 44, %8, %1 \n\t" + "stxvd2x 45, %9, %1 \n\t" + "stxvd2x 46, %10, %1 \n\t" + "stxvd2x 47, %11, %1 \n\t" + + "addi %1, %1, 128 \n\t" + + "stxvd2x 50, 0, %1 \n\t" + "stxvd2x 51, %5, %1 \n\t" + "stxvd2x 52, %6, %1 \n\t" + "stxvd2x 53, %7, %1 \n\t" + "stxvd2x 54, %8, %1 \n\t" + "stxvd2x 55, %9, %1 \n\t" + "stxvd2x 56, %10, %1 \n\t" + "stxvd2x 57, %11, %1 \n\t" + + + : + : + "r" (i), // 0 + "r" (y1), // 1 + "r" (x1), // 2 + "r" (alpha), // 3 + "r" (pre), // 4 + "r" (o16), // 5 + "r" (o32), // 6 + "r" (o48), // 7 + "r" (o64), // 8 + "r" (o80), // 9 + "r" (o96), // 10 + "r" (o112) // 11 + : "cr0", "%0", "%2" , "%1", "memory" + ); + +} + + From 7316a879301235148c0501ad3f6be243a48a1b58 Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Fri, 25 Mar 2016 14:35:43 +0100 Subject: [PATCH 20/48] added optimized dswap kernel for POWER8 --- kernel/power/KERNEL.POWER8 | 2 +- kernel/power/dswap.c | 154 ++++++++++++++++++++++++ kernel/power/dswap_microk_power8.c | 180 +++++++++++++++++++++++++++++ 3 files changed, 335 insertions(+), 1 deletion(-) create mode 100644 kernel/power/dswap.c create mode 100644 kernel/power/dswap_microk_power8.c diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index a0e4ac3f9..fc6a729fe 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -141,7 +141,7 @@ DSCALKERNEL = dscal.c #ZSCALKERNEL = zscal.c # #SSWAPKERNEL = ../arm/swap.c -#DSWAPKERNEL = ../arm/swap.c +DSWAPKERNEL = dswap.c #CSWAPKERNEL = ../arm/zswap.c #ZSWAPKERNEL = ../arm/zswap.c # diff --git a/kernel/power/dswap.c b/kernel/power/dswap.c new file mode 100644 index 000000000..fd2dec9c4 --- /dev/null +++ b/kernel/power/dswap.c @@ -0,0 +1,154 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2016/03/25 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#include "common.h" + +#if defined(POWER8) +#include "dswap_microk_power8.c" +#endif + +#ifndef HAVE_KERNEL_32 + +static void dswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) +{ + + BLASLONG i=0; + FLOAT f0, f1, f2, f3, f4, f5, f6, f7; + FLOAT g0, g1, g2, g3, g4, g5, g6, g7; + FLOAT *x1=x; + FLOAT *y1=y; + + while ( i 0 ) + { + dswap_kernel_32(n1, x, y); + i=n1; + } + + while(i < n) + { + temp = y[i]; + y[i] = x[i] ; + x[i] = temp; + i++ ; + + } + + + } + else + { + + while(i < n) + { + temp = y[iy]; + y[iy] = x[ix] ; + x[ix] = temp; + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + + } + return(0); + + +} + + diff --git a/kernel/power/dswap_microk_power8.c b/kernel/power/dswap_microk_power8.c new file mode 100644 index 000000000..77747c3b9 --- /dev/null +++ b/kernel/power/dswap_microk_power8.c @@ -0,0 +1,180 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/03/25 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#define HAVE_KERNEL_32 1 + +static void dswap_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); + +static void dswap_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y) +{ + + + BLASLONG i = n; + BLASLONG o16 = 16; + BLASLONG o32 = 32; + BLASLONG o48 = 48; + BLASLONG o64 = 64; + BLASLONG o80 = 80; + BLASLONG o96 = 96; + BLASLONG o112 = 112; + FLOAT *x1=x; + FLOAT *y1=y; + FLOAT *x2=x+1; + FLOAT *y2=y+1; + BLASLONG pre = 384; + BLASLONG alpha=0; + + __asm__ __volatile__ + ( + + "addi %3, %3, -8 \n\t" + "addi %4, %4, -8 \n\t" + + ".align 5 \n\t" + "1: \n\t" + + "lxvd2x 32, 0, %2 \n\t" + "lxvd2x 33, %5, %2 \n\t" + "lxvd2x 34, %6, %2 \n\t" + 
"lxvd2x 35, %7, %2 \n\t" + "lxvd2x 36, %8, %2 \n\t" + "lxvd2x 37, %9, %2 \n\t" + "lxvd2x 38, %10, %2 \n\t" + "lxvd2x 39, %11, %2 \n\t" + + "addi %2, %2, 128 \n\t" + + "lxvd2x 40, 0, %2 \n\t" + "lxvd2x 41, %5, %2 \n\t" + "lxvd2x 42, %6, %2 \n\t" + "lxvd2x 43, %7, %2 \n\t" + "lxvd2x 44, %8, %2 \n\t" + "lxvd2x 45, %9, %2 \n\t" + "lxvd2x 46, %10, %2 \n\t" + "lxvd2x 47, %11, %2 \n\t" + + "addi %2, %2, 128 \n\t" + + "lxvd2x 48, 0, %1 \n\t" + "lxvd2x 49, %5, %1 \n\t" + "lxvd2x 50, %6, %1 \n\t" + "lxvd2x 51, %7, %1 \n\t" + "lxvd2x 52, %8, %1 \n\t" + "lxvd2x 53, %9, %1 \n\t" + "lxvd2x 54, %10, %1 \n\t" + "lxvd2x 55, %11, %1 \n\t" + + "addi %1, %1, 128 \n\t" + + "lxvd2x 56, 0, %1 \n\t" + "lxvd2x 57, %5, %1 \n\t" + "lxvd2x 58, %6, %1 \n\t" + "lxvd2x 59, %7, %1 \n\t" + "lxvd2x 60, %8, %1 \n\t" + "lxvd2x 61, %9, %1 \n\t" + "lxvd2x 62, %10, %1 \n\t" + "lxvd2x 63, %11, %1 \n\t" + + "addi %1, %1, 128 \n\t" + + "stxvd2x 32, 0, %3 \n\t" + "stxvd2x 33, %5, %3 \n\t" + "stxvd2x 34, %6, %3 \n\t" + "stxvd2x 35, %7, %3 \n\t" + "stxvd2x 36, %8, %3 \n\t" + "stxvd2x 37, %9, %3 \n\t" + "stxvd2x 38, %10, %3 \n\t" + "stxvd2x 39, %11, %3 \n\t" + + "addi %3, %3, 128 \n\t" + + "stxvd2x 40, 0, %3 \n\t" + "stxvd2x 41, %5, %3 \n\t" + "stxvd2x 42, %6, %3 \n\t" + "stxvd2x 43, %7, %3 \n\t" + "stxvd2x 44, %8, %3 \n\t" + "stxvd2x 45, %9, %3 \n\t" + "stxvd2x 46, %10, %3 \n\t" + "stxvd2x 47, %11, %3 \n\t" + + "addi %3, %3, 128 \n\t" + + "stxvd2x 48, 0, %4 \n\t" + "stxvd2x 49, %5, %4 \n\t" + "stxvd2x 50, %6, %4 \n\t" + "stxvd2x 51, %7, %4 \n\t" + "stxvd2x 52, %8, %4 \n\t" + "stxvd2x 53, %9, %4 \n\t" + "stxvd2x 54, %10, %4 \n\t" + "stxvd2x 55, %11, %4 \n\t" + + "addi %4, %4, 128 \n\t" + + "stxvd2x 56, 0, %4 \n\t" + "stxvd2x 57, %5, %4 \n\t" + "stxvd2x 58, %6, %4 \n\t" + "stxvd2x 59, %7, %4 \n\t" + "stxvd2x 60, %8, %4 \n\t" + "stxvd2x 61, %9, %4 \n\t" + "stxvd2x 62, %10, %4 \n\t" + "stxvd2x 63, %11, %4 \n\t" + + "addi %4, %4, 128 \n\t" + + "addic. 
%0 , %0 , -32 \n\t" + "bgt 1b \n\t" + + "2: \n\t" + + : + : + "r" (i), // 0 + "r" (y1), // 1 + "r" (x1), // 2 + "r" (y2), // 3 + "r" (x2), // 4 + "r" (o16), // 5 + "r" (o32), // 6 + "r" (o48), // 7 + "r" (o64), // 8 + "r" (o80), // 9 + "r" (o96), // 10 + "r" (o112) // 11 + : "cr0", "%0", "%2" , "%1", "%3", "%4", "memory" + ); + +} + + From 12f209b7b0bfbb2f6dda8c7259187b161a03511f Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Fri, 25 Mar 2016 15:27:34 +0100 Subject: [PATCH 21/48] added optimized zswap kernel for POWER8 --- kernel/power/KERNEL.POWER8 | 2 +- kernel/power/zcopy.c | 140 +++++++++++++++++++++++ kernel/power/zcopy_microk_power8.c | 174 +++++++++++++++++++++++++++++ 3 files changed, 315 insertions(+), 1 deletion(-) create mode 100644 kernel/power/zcopy.c create mode 100644 kernel/power/zcopy_microk_power8.c diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index fc6a729fe..4f410b005 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -118,7 +118,7 @@ ZAXPYKERNEL = zaxpy.c #SCOPYKERNEL = ../arm/copy.c DCOPYKERNEL = dcopy.c #CCOPYKERNEL = ../arm/zcopy.c -#ZCOPYKERNEL = ../arm/zcopy.c +ZCOPYKERNEL = zcopy.c # SDOTKERNEL = sdot.c DDOTKERNEL = ddot.c diff --git a/kernel/power/zcopy.c b/kernel/power/zcopy.c new file mode 100644 index 000000000..a7658f7ab --- /dev/null +++ b/kernel/power/zcopy.c @@ -0,0 +1,140 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2016/03/25 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#include "common.h" + +#if defined(POWER8) +#include "zcopy_microk_power8.c" +#endif + +#ifndef HAVE_KERNEL_16 + +static void zcopy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) +{ + + BLASLONG i=0; + FLOAT f0, f1, f2, f3, f4, f5, f6, f7; + FLOAT *x1=x; + FLOAT *y1=y; + + while ( i 0 ) + { + zcopy_kernel_16(n1, x, y); + i=n1; + ix=n1*2; + iy=n1*2; + } + + while(i < n) + { + y[iy] = x[iy] ; + y[iy+1] = x[ix+1] ; + ix+=2; + iy+=2; + i++ ; + + } + + + } + else + { + + BLASLONG inc_x2 = 2 * inc_x; + BLASLONG inc_y2 = 2 * inc_y; + + while(i < n) + { + y[iy] = x[ix] ; + y[iy+1] = x[ix+1] ; + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + + } + + } + return(0); + + +} + + diff --git a/kernel/power/zcopy_microk_power8.c b/kernel/power/zcopy_microk_power8.c new file mode 100644 index 000000000..73abe084e --- /dev/null +++ b/kernel/power/zcopy_microk_power8.c @@ -0,0 +1,174 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/03/25 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#define HAVE_KERNEL_16 1 + +static void zcopy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); + +static void zcopy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y) +{ + + + BLASLONG i = n; + BLASLONG o16 = 16; + BLASLONG o32 = 32; + BLASLONG o48 = 48; + BLASLONG o64 = 64; + BLASLONG o80 = 80; + BLASLONG o96 = 96; + BLASLONG o112 = 112; + FLOAT *x1=x; + FLOAT *y1=y; + BLASLONG pre = 384; + BLASLONG alpha=0; + + __asm__ __volatile__ + ( + + "lxvd2x 40, 0, %2 \n\t" + "lxvd2x 41, %5, %2 \n\t" + "lxvd2x 42, %6, %2 \n\t" + "lxvd2x 43, %7, %2 \n\t" + "lxvd2x 44, %8, %2 \n\t" + "lxvd2x 45, %9, %2 \n\t" + "lxvd2x 46, %10, %2 \n\t" + "lxvd2x 47, 
%11, %2 \n\t" + + "addi %2, %2, 128 \n\t" + + "lxvd2x 50, 0, %2 \n\t" + "lxvd2x 51, %5, %2 \n\t" + "lxvd2x 52, %6, %2 \n\t" + "lxvd2x 53, %7, %2 \n\t" + "lxvd2x 54, %8, %2 \n\t" + "lxvd2x 55, %9, %2 \n\t" + "lxvd2x 56, %10, %2 \n\t" + "lxvd2x 57, %11, %2 \n\t" + + "addi %2, %2, 128 \n\t" + + "addic. %0 , %0 , -16 \n\t" + "ble 2f \n\t" + + ".align 5 \n\t" + "1: \n\t" + + "stxvd2x 40, 0, %1 \n\t" + "stxvd2x 41, %5, %1 \n\t" + "lxvd2x 40, 0, %2 \n\t" + "lxvd2x 41, %5, %2 \n\t" + "stxvd2x 42, %6, %1 \n\t" + "stxvd2x 43, %7, %1 \n\t" + "lxvd2x 42, %6, %2 \n\t" + "lxvd2x 43, %7, %2 \n\t" + "stxvd2x 44, %8, %1 \n\t" + "stxvd2x 45, %9, %1 \n\t" + "lxvd2x 44, %8, %2 \n\t" + "lxvd2x 45, %9, %2 \n\t" + "stxvd2x 46, %10, %1 \n\t" + "stxvd2x 47, %11, %1 \n\t" + "lxvd2x 46, %10, %2 \n\t" + "lxvd2x 47, %11, %2 \n\t" + + + "addi %1, %1, 128 \n\t" + "addi %2, %2, 128 \n\t" + + "stxvd2x 50, 0, %1 \n\t" + "stxvd2x 51, %5, %1 \n\t" + "lxvd2x 50, 0, %2 \n\t" + "lxvd2x 51, %5, %2 \n\t" + "stxvd2x 52, %6, %1 \n\t" + "stxvd2x 53, %7, %1 \n\t" + "lxvd2x 52, %6, %2 \n\t" + "lxvd2x 53, %7, %2 \n\t" + "stxvd2x 54, %8, %1 \n\t" + "stxvd2x 55, %9, %1 \n\t" + "lxvd2x 54, %8, %2 \n\t" + "lxvd2x 55, %9, %2 \n\t" + "stxvd2x 56, %10, %1 \n\t" + "stxvd2x 57, %11, %1 \n\t" + "lxvd2x 56, %10, %2 \n\t" + "lxvd2x 57, %11, %2 \n\t" + + "addi %1, %1, 128 \n\t" + "addi %2, %2, 128 \n\t" + + "addic. 
%0 , %0 , -16 \n\t" + "bgt 1b \n\t" + + "2: \n\t" + + "stxvd2x 40, 0, %1 \n\t" + "stxvd2x 41, %5, %1 \n\t" + "stxvd2x 42, %6, %1 \n\t" + "stxvd2x 43, %7, %1 \n\t" + "stxvd2x 44, %8, %1 \n\t" + "stxvd2x 45, %9, %1 \n\t" + "stxvd2x 46, %10, %1 \n\t" + "stxvd2x 47, %11, %1 \n\t" + + "addi %1, %1, 128 \n\t" + + "stxvd2x 50, 0, %1 \n\t" + "stxvd2x 51, %5, %1 \n\t" + "stxvd2x 52, %6, %1 \n\t" + "stxvd2x 53, %7, %1 \n\t" + "stxvd2x 54, %8, %1 \n\t" + "stxvd2x 55, %9, %1 \n\t" + "stxvd2x 56, %10, %1 \n\t" + "stxvd2x 57, %11, %1 \n\t" + + + : + : + "r" (i), // 0 + "r" (y1), // 1 + "r" (x1), // 2 + "r" (alpha), // 3 + "r" (pre), // 4 + "r" (o16), // 5 + "r" (o32), // 6 + "r" (o48), // 7 + "r" (o64), // 8 + "r" (o80), // 9 + "r" (o96), // 10 + "r" (o112) // 11 + : "cr0", "%0", "%2" , "%1", "memory" + ); + +} + + From ecc0bc981313fa94ad5ae1ef22288733cc6fd18a Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Fri, 25 Mar 2016 16:06:56 +0100 Subject: [PATCH 22/48] added optimized scopy kernel for POWER8 --- kernel/power/KERNEL.POWER8 | 2 +- kernel/power/scopy.c | 131 +++++++++++++++++++++++++++++ kernel/power/scopy_microk_power8.c | 131 +++++++++++++++++++++++++++++ 3 files changed, 263 insertions(+), 1 deletion(-) create mode 100644 kernel/power/scopy.c create mode 100644 kernel/power/scopy_microk_power8.c diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index 4f410b005..dee0c2660 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -115,7 +115,7 @@ DAXPYKERNEL = daxpy.c #CAXPYKERNEL = ../arm/zaxpy.c ZAXPYKERNEL = zaxpy.c # -#SCOPYKERNEL = ../arm/copy.c +SCOPYKERNEL = scopy.c DCOPYKERNEL = dcopy.c #CCOPYKERNEL = ../arm/zcopy.c ZCOPYKERNEL = zcopy.c diff --git a/kernel/power/scopy.c b/kernel/power/scopy.c new file mode 100644 index 000000000..167c29bab --- /dev/null +++ b/kernel/power/scopy.c @@ -0,0 +1,131 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All 
rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2016/03/25 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#include "common.h" + +#if defined(POWER8) +#include "scopy_microk_power8.c" +#endif + +#ifndef HAVE_KERNEL_32 + +static void scopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) +{ + + BLASLONG i=0; + FLOAT f0, f1, f2, f3, f4, f5, f6, f7; + FLOAT *x1=x; + FLOAT *y1=y; + + while ( i 0 ) + { + scopy_kernel_32(n1, x, y); + i=n1; + } + + while(i < n) + { + y[i] = x[i] ; + i++ ; + + } + + + } + else + { + + while(i < n) + { + y[iy] = x[ix] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + + } + return(0); + + +} + + diff --git a/kernel/power/scopy_microk_power8.c b/kernel/power/scopy_microk_power8.c new file mode 100644 index 000000000..2e08e3561 --- /dev/null +++ b/kernel/power/scopy_microk_power8.c @@ -0,0 +1,131 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/03/25 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#define HAVE_KERNEL_32 1 + +static void scopy_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); + +static void scopy_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y) +{ + + + BLASLONG i = n; + BLASLONG o16 = 16; + BLASLONG o32 = 32; + BLASLONG o48 = 48; + BLASLONG o64 = 64; + BLASLONG o80 = 80; + BLASLONG o96 = 96; + BLASLONG o112 = 112; + FLOAT *x1=x; + FLOAT *y1=y; + BLASLONG pre = 384; + BLASLONG alpha=0; + + __asm__ __volatile__ + ( + + "lxvw4x 40, 0, %2 \n\t" + "lxvw4x 41, %5, %2 \n\t" + "lxvw4x 42, %6, %2 \n\t" + "lxvw4x 43, %7, %2 \n\t" + "lxvw4x 44, %8, %2 \n\t" + "lxvw4x 45, %9, %2 \n\t" + "lxvw4x 46, %10, %2 \n\t" + "lxvw4x 47, %11, %2 \n\t" + + "addi %2, %2, 128 \n\t" + + "addic. 
%0 , %0 , -32 \n\t" + "ble 2f \n\t" + + ".align 5 \n\t" + "1: \n\t" + + "stxvw4x 40, 0, %1 \n\t" + "stxvw4x 41, %5, %1 \n\t" + "lxvw4x 40, 0, %2 \n\t" + "lxvw4x 41, %5, %2 \n\t" + "stxvw4x 42, %6, %1 \n\t" + "stxvw4x 43, %7, %1 \n\t" + "lxvw4x 42, %6, %2 \n\t" + "lxvw4x 43, %7, %2 \n\t" + "stxvw4x 44, %8, %1 \n\t" + "stxvw4x 45, %9, %1 \n\t" + "lxvw4x 44, %8, %2 \n\t" + "lxvw4x 45, %9, %2 \n\t" + "stxvw4x 46, %10, %1 \n\t" + "stxvw4x 47, %11, %1 \n\t" + "lxvw4x 46, %10, %2 \n\t" + "lxvw4x 47, %11, %2 \n\t" + + + "addi %1, %1, 128 \n\t" + "addi %2, %2, 128 \n\t" + + "addic. %0 , %0 , -32 \n\t" + "bgt 1b \n\t" + + "2: \n\t" + + "stxvw4x 40, 0, %1 \n\t" + "stxvw4x 41, %5, %1 \n\t" + "stxvw4x 42, %6, %1 \n\t" + "stxvw4x 43, %7, %1 \n\t" + "stxvw4x 44, %8, %1 \n\t" + "stxvw4x 45, %9, %1 \n\t" + "stxvw4x 46, %10, %1 \n\t" + "stxvw4x 47, %11, %1 \n\t" + + : + : + "r" (i), // 0 + "r" (y1), // 1 + "r" (x1), // 2 + "r" (alpha), // 3 + "r" (pre), // 4 + "r" (o16), // 5 + "r" (o32), // 6 + "r" (o48), // 7 + "r" (o64), // 8 + "r" (o80), // 9 + "r" (o96), // 10 + "r" (o112) // 11 + : "cr0", "%0", "%2" , "%1", "memory" + ); + +} + + From 828c849b446723401e10d05f0974d57982adab09 Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Fri, 25 Mar 2016 16:54:25 +0100 Subject: [PATCH 23/48] added optimized ccopy kernel for POWER8 --- kernel/power/KERNEL.POWER8 | 2 +- kernel/power/ccopy.c | 140 +++++++++++++++++++++++ kernel/power/ccopy_microk_power8.c | 174 +++++++++++++++++++++++++++++ 3 files changed, 315 insertions(+), 1 deletion(-) create mode 100644 kernel/power/ccopy.c create mode 100644 kernel/power/ccopy_microk_power8.c diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index dee0c2660..576ad506d 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -117,7 +117,7 @@ ZAXPYKERNEL = zaxpy.c # SCOPYKERNEL = scopy.c DCOPYKERNEL = dcopy.c -#CCOPYKERNEL = ../arm/zcopy.c +CCOPYKERNEL = ccopy.c ZCOPYKERNEL = zcopy.c # SDOTKERNEL = sdot.c diff --git 
a/kernel/power/ccopy.c b/kernel/power/ccopy.c new file mode 100644 index 000000000..ce7d67475 --- /dev/null +++ b/kernel/power/ccopy.c @@ -0,0 +1,140 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2016/03/25 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#include "common.h" + +#if defined(POWER8) +#include "ccopy_microk_power8.c" +#endif + +#ifndef HAVE_KERNEL_32 + +static void ccopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) +{ + + BLASLONG i=0; + FLOAT f0, f1, f2, f3, f4, f5, f6, f7; + FLOAT *x1=x; + FLOAT *y1=y; + + while ( i 0 ) + { + ccopy_kernel_32(n1, x, y); + i=n1; + ix=n1*2; + iy=n1*2; + } + + while(i < n) + { + y[iy] = x[iy] ; + y[iy+1] = x[ix+1] ; + ix+=2; + iy+=2; + i++ ; + + } + + + } + else + { + + BLASLONG inc_x2 = 2 * inc_x; + BLASLONG inc_y2 = 2 * inc_y; + + while(i < n) + { + y[iy] = x[ix] ; + y[iy+1] = x[ix+1] ; + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + + } + + } + return(0); + + +} + + diff --git a/kernel/power/ccopy_microk_power8.c b/kernel/power/ccopy_microk_power8.c new file mode 100644 index 000000000..95b3559ba --- /dev/null +++ b/kernel/power/ccopy_microk_power8.c @@ -0,0 +1,174 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/03/25 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#define HAVE_KERNEL_32 1 + +static void ccopy_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); + +static void ccopy_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y) +{ + + + BLASLONG i = n; + BLASLONG o16 = 16; + BLASLONG o32 = 32; + BLASLONG o48 = 48; + BLASLONG o64 = 64; + BLASLONG o80 = 80; + BLASLONG o96 = 96; + BLASLONG o112 = 112; + FLOAT *x1=x; + FLOAT *y1=y; + BLASLONG pre = 384; + BLASLONG alpha=0; + + __asm__ __volatile__ + ( + + "lxvw4x 40, 0, %2 \n\t" + "lxvw4x 41, %5, %2 \n\t" + "lxvw4x 42, %6, %2 \n\t" + "lxvw4x 43, %7, %2 \n\t" + "lxvw4x 44, %8, %2 \n\t" + "lxvw4x 45, %9, %2 \n\t" + "lxvw4x 46, %10, %2 \n\t" + "lxvw4x 47, 
%11, %2 \n\t" + + "addi %2, %2, 128 \n\t" + + "lxvw4x 50, 0, %2 \n\t" + "lxvw4x 51, %5, %2 \n\t" + "lxvw4x 52, %6, %2 \n\t" + "lxvw4x 53, %7, %2 \n\t" + "lxvw4x 54, %8, %2 \n\t" + "lxvw4x 55, %9, %2 \n\t" + "lxvw4x 56, %10, %2 \n\t" + "lxvw4x 57, %11, %2 \n\t" + + "addi %2, %2, 128 \n\t" + + "addic. %0 , %0 , -32 \n\t" + "ble 2f \n\t" + + ".align 5 \n\t" + "1: \n\t" + + "stxvw4x 40, 0, %1 \n\t" + "stxvw4x 41, %5, %1 \n\t" + "lxvw4x 40, 0, %2 \n\t" + "lxvw4x 41, %5, %2 \n\t" + "stxvw4x 42, %6, %1 \n\t" + "stxvw4x 43, %7, %1 \n\t" + "lxvw4x 42, %6, %2 \n\t" + "lxvw4x 43, %7, %2 \n\t" + "stxvw4x 44, %8, %1 \n\t" + "stxvw4x 45, %9, %1 \n\t" + "lxvw4x 44, %8, %2 \n\t" + "lxvw4x 45, %9, %2 \n\t" + "stxvw4x 46, %10, %1 \n\t" + "stxvw4x 47, %11, %1 \n\t" + "lxvw4x 46, %10, %2 \n\t" + "lxvw4x 47, %11, %2 \n\t" + + + "addi %1, %1, 128 \n\t" + "addi %2, %2, 128 \n\t" + + "stxvw4x 50, 0, %1 \n\t" + "stxvw4x 51, %5, %1 \n\t" + "lxvw4x 50, 0, %2 \n\t" + "lxvw4x 51, %5, %2 \n\t" + "stxvw4x 52, %6, %1 \n\t" + "stxvw4x 53, %7, %1 \n\t" + "lxvw4x 52, %6, %2 \n\t" + "lxvw4x 53, %7, %2 \n\t" + "stxvw4x 54, %8, %1 \n\t" + "stxvw4x 55, %9, %1 \n\t" + "lxvw4x 54, %8, %2 \n\t" + "lxvw4x 55, %9, %2 \n\t" + "stxvw4x 56, %10, %1 \n\t" + "stxvw4x 57, %11, %1 \n\t" + "lxvw4x 56, %10, %2 \n\t" + "lxvw4x 57, %11, %2 \n\t" + + "addi %1, %1, 128 \n\t" + "addi %2, %2, 128 \n\t" + + "addic. 
%0 , %0 , -32 \n\t" + "bgt 1b \n\t" + + "2: \n\t" + + "stxvw4x 40, 0, %1 \n\t" + "stxvw4x 41, %5, %1 \n\t" + "stxvw4x 42, %6, %1 \n\t" + "stxvw4x 43, %7, %1 \n\t" + "stxvw4x 44, %8, %1 \n\t" + "stxvw4x 45, %9, %1 \n\t" + "stxvw4x 46, %10, %1 \n\t" + "stxvw4x 47, %11, %1 \n\t" + + "addi %1, %1, 128 \n\t" + + "stxvw4x 50, 0, %1 \n\t" + "stxvw4x 51, %5, %1 \n\t" + "stxvw4x 52, %6, %1 \n\t" + "stxvw4x 53, %7, %1 \n\t" + "stxvw4x 54, %8, %1 \n\t" + "stxvw4x 55, %9, %1 \n\t" + "stxvw4x 56, %10, %1 \n\t" + "stxvw4x 57, %11, %1 \n\t" + + + : + : + "r" (i), // 0 + "r" (y1), // 1 + "r" (x1), // 2 + "r" (alpha), // 3 + "r" (pre), // 4 + "r" (o16), // 5 + "r" (o32), // 6 + "r" (o48), // 7 + "r" (o64), // 8 + "r" (o80), // 9 + "r" (o96), // 10 + "r" (o112) // 11 + : "cr0", "%0", "%2" , "%1", "memory" + ); + +} + + From 3d9a50e8414ac7d9b547d43f864fcde0ed542e26 Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Fri, 25 Mar 2016 17:34:55 +0100 Subject: [PATCH 24/48] added optimized sswap kernel for POWER8 --- kernel/power/KERNEL.POWER8 | 2 +- kernel/power/sswap.c | 154 +++++++++++++++++++++++++++++ kernel/power/sswap_microk_power8.c | 136 +++++++++++++++++++++++++ 3 files changed, 291 insertions(+), 1 deletion(-) create mode 100644 kernel/power/sswap.c create mode 100644 kernel/power/sswap_microk_power8.c diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index 576ad506d..0f4516812 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -140,7 +140,7 @@ DSCALKERNEL = dscal.c #CSCALKERNEL = ../arm/zscal.c #ZSCALKERNEL = zscal.c # -#SSWAPKERNEL = ../arm/swap.c +SSWAPKERNEL = sswap.c DSWAPKERNEL = dswap.c #CSWAPKERNEL = ../arm/zswap.c #ZSWAPKERNEL = ../arm/zswap.c diff --git a/kernel/power/sswap.c b/kernel/power/sswap.c new file mode 100644 index 000000000..932652b37 --- /dev/null +++ b/kernel/power/sswap.c @@ -0,0 +1,154 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project 
+All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2016/03/25 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#include "common.h" + +#if defined(POWER8) +#include "sswap_microk_power8.c" +#endif + +#ifndef HAVE_KERNEL_32 + +static void sswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) +{ + + BLASLONG i=0; + FLOAT f0, f1, f2, f3, f4, f5, f6, f7; + FLOAT g0, g1, g2, g3, g4, g5, g6, g7; + FLOAT *x1=x; + FLOAT *y1=y; + + while ( i 0 ) + { + sswap_kernel_32(n1, x, y); + i=n1; + } + + while(i < n) + { + temp = y[i]; + y[i] = x[i] ; + x[i] = temp; + i++ ; + + } + + + } + else + { + + while(i < n) + { + temp = y[iy]; + y[iy] = x[ix] ; + x[ix] = temp; + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + + } + return(0); + + +} + + diff --git a/kernel/power/sswap_microk_power8.c b/kernel/power/sswap_microk_power8.c new file mode 100644 index 000000000..c48e743de --- /dev/null +++ b/kernel/power/sswap_microk_power8.c @@ -0,0 +1,136 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/03/25 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#define HAVE_KERNEL_32 1 + +static void sswap_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); + +static void sswap_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y) +{ + + + BLASLONG i = n; + BLASLONG o16 = 16; + BLASLONG o32 = 32; + BLASLONG o48 = 48; + BLASLONG o64 = 64; + BLASLONG o80 = 80; + BLASLONG o96 = 96; + BLASLONG o112 = 112; + FLOAT *x1=x; + FLOAT *y1=y; + FLOAT *x2=x+1; + FLOAT *y2=y+1; + BLASLONG pre = 384; + BLASLONG alpha=0; + + __asm__ __volatile__ + ( + + "addi %3, %3, -4 \n\t" + "addi %4, %4, -4 \n\t" + + ".align 5 \n\t" + "1: \n\t" + + "lxvw4x 32, 0, %2 \n\t" + "lxvw4x 33, %5, %2 \n\t" + "lxvw4x 34, %6, %2 \n\t" + 
"lxvw4x 35, %7, %2 \n\t" + "lxvw4x 36, %8, %2 \n\t" + "lxvw4x 37, %9, %2 \n\t" + "lxvw4x 38, %10, %2 \n\t" + "lxvw4x 39, %11, %2 \n\t" + + "addi %2, %2, 128 \n\t" + + "lxvw4x 48, 0, %1 \n\t" + "lxvw4x 49, %5, %1 \n\t" + "lxvw4x 50, %6, %1 \n\t" + "lxvw4x 51, %7, %1 \n\t" + "lxvw4x 52, %8, %1 \n\t" + "lxvw4x 53, %9, %1 \n\t" + "lxvw4x 54, %10, %1 \n\t" + "lxvw4x 55, %11, %1 \n\t" + + "addi %1, %1, 128 \n\t" + + "stxvw4x 32, 0, %3 \n\t" + "stxvw4x 33, %5, %3 \n\t" + "stxvw4x 34, %6, %3 \n\t" + "stxvw4x 35, %7, %3 \n\t" + "stxvw4x 36, %8, %3 \n\t" + "stxvw4x 37, %9, %3 \n\t" + "stxvw4x 38, %10, %3 \n\t" + "stxvw4x 39, %11, %3 \n\t" + + "addi %3, %3, 128 \n\t" + + "stxvw4x 48, 0, %4 \n\t" + "stxvw4x 49, %5, %4 \n\t" + "stxvw4x 50, %6, %4 \n\t" + "stxvw4x 51, %7, %4 \n\t" + "stxvw4x 52, %8, %4 \n\t" + "stxvw4x 53, %9, %4 \n\t" + "stxvw4x 54, %10, %4 \n\t" + "stxvw4x 55, %11, %4 \n\t" + + "addi %4, %4, 128 \n\t" + + "addic. %0 , %0 , -32 \n\t" + "bgt 1b \n\t" + + "2: \n\t" + + : + : + "r" (i), // 0 + "r" (y1), // 1 + "r" (x1), // 2 + "r" (y2), // 3 + "r" (x2), // 4 + "r" (o16), // 5 + "r" (o32), // 6 + "r" (o48), // 7 + "r" (o64), // 8 + "r" (o80), // 9 + "r" (o96), // 10 + "r" (o112) // 11 + : "cr0", "%0", "%2" , "%1", "%3", "%4", "memory" + ); + +} + + From 7a92c1538e1b82431b3d114be66b1f11ae92e888 Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Sat, 26 Mar 2016 07:14:13 +0100 Subject: [PATCH 25/48] added benchmark test for srot and drot --- benchmark/Makefile | 42 ++++++++++ benchmark/rot.c | 197 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 239 insertions(+) create mode 100644 benchmark/rot.c diff --git a/benchmark/Makefile b/benchmark/Makefile index 11d3c5bec..4692c640e 100644 --- a/benchmark/Makefile +++ b/benchmark/Makefile @@ -44,6 +44,7 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ ssyr2k.goto dsyr2k.goto csyr2k.goto zsyr2k.goto \ sger.goto dger.goto cger.goto zger.goto \ sdot.goto ddot.goto \ + srot.goto drot.goto \ 
saxpy.goto daxpy.goto caxpy.goto zaxpy.goto \ scopy.goto dcopy.goto ccopy.goto zcopy.goto \ sswap.goto dswap.goto cswap.goto zswap.goto \ @@ -151,6 +152,7 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ ssyr2k.goto dsyr2k.goto csyr2k.goto zsyr2k.goto \ sger.goto dger.goto cger.goto zger.goto \ sdot.goto ddot.goto cdot.goto zdot.goto \ + srot.goto drot.goto \ saxpy.goto daxpy.goto caxpy.goto zaxpy.goto \ scopy.goto dcopy.goto ccopy.goto zcopy.goto \ sswap.goto dswap.goto cswap.goto zswap.goto \ @@ -1413,6 +1415,39 @@ zdot.mkl : zdot-intel.$(SUFFIX) zdot.veclib : zdot-intel.$(SUFFIX) $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### Srot #################################################### +srot.goto : srot.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +srot.acml : srot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +srot.atlas : srot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +srot.mkl : srot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +srot.veclib : srot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Drot #################################################### +drot.goto : drot.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +drot.acml : drot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +drot.atlas : drot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +drot.mkl : drot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +drot.veclib : drot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + + 
##################################### Saxpy #################################################### saxpy.goto : saxpy.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm @@ -2124,6 +2159,13 @@ cgesv.$(SUFFIX) : gesv.c zgesv.$(SUFFIX) : gesv.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ +srot.$(SUFFIX) : rot.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +drot.$(SUFFIX) : rot.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + + diff --git a/benchmark/rot.c b/benchmark/rot.c new file mode 100644 index 000000000..32322bebb --- /dev/null +++ b/benchmark/rot.c @@ -0,0 +1,197 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + + +#undef DOT + + +#ifdef DOUBLE +#define ROT BLASFUNC(drot) +#else +#define ROT BLASFUNC(srot) +#endif + + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + 
shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x, *y; + // FLOAT result; + blasint m, i; + blasint inc_x=1,inc_y=1; + FLOAT c[1] = { 2.0 }; + FLOAT s[1] = { 2.0 }; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + + fprintf(stderr, " %6d : ", (int)m); + + + for (l=0; l Date: Sun, 27 Mar 2016 08:57:11 +0200 Subject: [PATCH 26/48] added drot- and srot-kernel optimimized for POWER8 --- kernel/power/KERNEL.POWER8 | 4 +- kernel/power/drot.c | 167 +++++++++++++++++++++++ kernel/power/drot_microk_power8.c | 211 ++++++++++++++++++++++++++++++ kernel/power/srot.c | 167 +++++++++++++++++++++++ kernel/power/srot_microk_power8.c | 208 +++++++++++++++++++++++++++++ 5 files changed, 755 insertions(+), 2 deletions(-) create mode 100644 kernel/power/drot.c create mode 100644 kernel/power/drot_microk_power8.c create mode 100644 kernel/power/srot.c create mode 100644 kernel/power/srot_microk_power8.c diff --git 
a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index 0f4516812..99bb38096 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -130,8 +130,8 @@ ZDOTKERNEL = zdot.c #CNRM2KERNEL = ../arm/znrm2.c #ZNRM2KERNEL = ../arm/znrm2.c # -#SROTKERNEL = ../arm/rot.c -#DROTKERNEL = ../arm/rot.c +SROTKERNEL = srot.c +DROTKERNEL = drot.c #CROTKERNEL = ../arm/zrot.c #ZROTKERNEL = ../arm/zrot.c # diff --git a/kernel/power/drot.c b/kernel/power/drot.c new file mode 100644 index 000000000..c93f69b12 --- /dev/null +++ b/kernel/power/drot.c @@ -0,0 +1,167 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/03/27 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + + + +#include "common.h" + +#pragma GCC optimize "O1" + +#if defined(POWER8) +#include "drot_microk_power8.c" +#endif + + +#ifndef HAVE_KERNEL_16 + +static void drot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) +{ + + BLASLONG i=0; + FLOAT f0, f1, f2, f3; + FLOAT x00, x01, x02, x03; + FLOAT g0, g1, g2, g3; + FLOAT y00, y01, y02, y03; + FLOAT *x1=x; + FLOAT *y1=y; + FLOAT c1=*c; + FLOAT s1=*s; + + while ( i 0 ) + { + c1[0]=c; + c1[1]=c; + c1[2]=c; + c1[3]=c; + s1[0]=s; + s1[1]=s; + s1[2]=s; + s1[3]=s; + drot_kernel_16(n1, x1, y1, c1, s1); + i=n1; + } + + while(i < n) + { + temp = c*x[i] + s*y[i] ; + y[i] = c*y[i] - s*x[i] ; + x[i] = temp ; + + i++ ; + + } + + + } + else + { + + while(i < n) + { + temp = c*x[ix] + s*y[iy] ; + y[iy] = c*y[iy] - s*x[ix] ; + x[ix] = temp ; + + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + + } + return(0); + +} + + diff --git a/kernel/power/drot_microk_power8.c b/kernel/power/drot_microk_power8.c new file mode 100644 index 000000000..4444ac7eb --- /dev/null +++ b/kernel/power/drot_microk_power8.c @@ -0,0 +1,211 @@ 
+/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2016/03/27 Werner Saar (wernsaar@googlemail.com) +* +* I don't use fused multiply-add ( precision problems with lapack ) +* +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#define HAVE_KERNEL_16 1 + +static void drot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) __attribute__ ((noinline)); + +static void drot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) +{ + + + BLASLONG i = n; + BLASLONG o16 = 16; + BLASLONG o32 = 32; + BLASLONG o48 = 48; + FLOAT *x1=x; + FLOAT *y1=y; + FLOAT *x2=x+1; + FLOAT *y2=y+1; + + __asm__ __volatile__ + ( + + "lxsdx 36 , %5, %3 \n\t" // load c + "lxsdx 37 , %5, %4 \n\t" // load s + "addi %8 , %8, -8 \n\t" + "addi %9 , %9, -8 \n\t" + + "xxspltd 36 , 36, 0 \n\t" + "xxspltd 37 , 37, 0 \n\t" + + "lxvd2x 32, 0, %1 \n\t" // load x + "lxvd2x 33, %5, %1 \n\t" + "lxvd2x 34, %6, %1 \n\t" + "lxvd2x 35, %7, %1 \n\t" + + "lxvd2x 40, 0, %2 \n\t" // load y + "lxvd2x 41, %5, %2 \n\t" + "lxvd2x 42, %6, %2 \n\t" + "lxvd2x 43, %7, %2 \n\t" + + "addi %1, %1, 64 \n\t" + "addi %2, %2, 64 \n\t" + + "addic. 
%0 , %0 , -8 \n\t" + "ble 2f \n\t" + + ".align 5 \n\t" + "1: \n\t" + + "xvmuldp 48, 32, 36 \n\t" // c * x + "xvmuldp 49, 33, 36 \n\t" + "xvmuldp 50, 34, 36 \n\t" + "xvmuldp 51, 35, 36 \n\t" + + "xvmuldp 56, 40, 36 \n\t" // c * y + "xvmuldp 57, 41, 36 \n\t" + "xvmuldp 58, 42, 36 \n\t" + "xvmuldp 59, 43, 36 \n\t" + + "xvmuldp 52, 32, 37 \n\t" // s * x + "xvmuldp 53, 33, 37 \n\t" + + "lxvd2x 32, 0, %1 \n\t" // load x + "lxvd2x 33, %5, %1 \n\t" + + "xvmuldp 54, 34, 37 \n\t" + "xvmuldp 55, 35, 37 \n\t" + + "lxvd2x 34, %6, %1 \n\t" + "lxvd2x 35, %7, %1 \n\t" + + "xvmuldp 60, 40, 37 \n\t" // s * y + "xvmuldp 61, 41, 37 \n\t" + + "lxvd2x 40, 0, %2 \n\t" // load y + "lxvd2x 41, %5, %2 \n\t" + + "xvmuldp 62, 42, 37 \n\t" + "xvmuldp 63, 43, 37 \n\t" + + "lxvd2x 42, %6, %2 \n\t" + "lxvd2x 43, %7, %2 \n\t" + + "xvadddp 48, 48 , 60 \n\t" // c * x + s * y + "xvadddp 49, 49 , 61 \n\t" // c * x + s * y + + "addi %1, %1, 64 \n\t" + "addi %2, %2, 64 \n\t" + + "xvadddp 50, 50 , 62 \n\t" // c * x + s * y + "xvadddp 51, 51 , 63 \n\t" // c * x + s * y + + "xvsubdp 56, 56 , 52 \n\t" // c * y - s * x + "xvsubdp 57, 57 , 53 \n\t" // c * y - s * x + "xvsubdp 58, 58 , 54 \n\t" // c * y - s * x + "xvsubdp 59, 59 , 55 \n\t" // c * y - s * x + + "stxvd2x 48, 0, %8 \n\t" // store x + "stxvd2x 49, %5, %8 \n\t" + "stxvd2x 50, %6, %8 \n\t" + "stxvd2x 51, %7, %8 \n\t" + + "stxvd2x 56, 0, %9 \n\t" // store y + "stxvd2x 57, %5, %9 \n\t" + "stxvd2x 58, %6, %9 \n\t" + "stxvd2x 59, %7, %9 \n\t" + + "addi %8, %8, 64 \n\t" + "addi %9, %9, 64 \n\t" + + "addic. 
%0 , %0 , -8 \n\t" + "bgt 1b \n\t" + + "2: \n\t" + + "xvmuldp 48, 32, 36 \n\t" // c * x + "xvmuldp 49, 33, 36 \n\t" + "xvmuldp 50, 34, 36 \n\t" + "xvmuldp 51, 35, 36 \n\t" + + "xvmuldp 56, 40, 36 \n\t" // c * y + "xvmuldp 57, 41, 36 \n\t" + "xvmuldp 58, 42, 36 \n\t" + "xvmuldp 59, 43, 36 \n\t" + + "xvmuldp 52, 32, 37 \n\t" // s * x + "xvmuldp 53, 33, 37 \n\t" + "xvmuldp 54, 34, 37 \n\t" + "xvmuldp 55, 35, 37 \n\t" + + "xvmuldp 60, 40, 37 \n\t" // s * y + "xvmuldp 61, 41, 37 \n\t" + "xvmuldp 62, 42, 37 \n\t" + "xvmuldp 63, 43, 37 \n\t" + + "xvadddp 48, 48 , 60 \n\t" // c * x + s * y + "xvadddp 49, 49 , 61 \n\t" // c * x + s * y + "xvadddp 50, 50 , 62 \n\t" // c * x + s * y + "xvadddp 51, 51 , 63 \n\t" // c * x + s * y + + "xvsubdp 56, 56 , 52 \n\t" // c * y - s * x + "xvsubdp 57, 57 , 53 \n\t" // c * y - s * x + "xvsubdp 58, 58 , 54 \n\t" // c * y - s * x + "xvsubdp 59, 59 , 55 \n\t" // c * y - s * x + + "stxvd2x 48, 0, %8 \n\t" // store x + "stxvd2x 49, %5, %8 \n\t" + "stxvd2x 50, %6, %8 \n\t" + "stxvd2x 51, %7, %8 \n\t" + + "stxvd2x 56, 0, %9 \n\t" // store y + "stxvd2x 57, %5, %9 \n\t" + "stxvd2x 58, %6, %9 \n\t" + "stxvd2x 59, %7, %9 \n\t" + + + + : + : + "r" (i), // 0 + "r" (x1), // 1 + "r" (y1), // 2 + "r" (c), // 3 + "r" (s), // 4 + "r" (o16), // 5 + "r" (o32), // 6 + "r" (o48), // 7 + "r" (x2), // 8 + "r" (y2) // 9 + : "cr0", "%0", "%1" , "%2", "%8", "%9", "memory" + ); + +} + + diff --git a/kernel/power/srot.c b/kernel/power/srot.c new file mode 100644 index 000000000..d464846a4 --- /dev/null +++ b/kernel/power/srot.c @@ -0,0 +1,167 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2016/03/26 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + + + +#include "common.h" + +#pragma GCC optimize "O1" + +#if defined(POWER8) +#include "srot_microk_power8.c" +#endif + + +#ifndef HAVE_KERNEL_16 + +static void srot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) +{ + + BLASLONG i=0; + FLOAT f0, f1, f2, f3; + FLOAT x00, x01, x02, x03; + FLOAT g0, g1, g2, g3; + FLOAT y00, y01, y02, y03; + FLOAT *x1=x; + FLOAT *y1=y; + FLOAT c1=*c; + FLOAT s1=*s; + + while ( i 0 ) + { + c1[0]=c; + c1[1]=c; + c1[2]=c; + c1[3]=c; + s1[0]=s; + s1[1]=s; + s1[2]=s; + s1[3]=s; + srot_kernel_16(n1, x1, y1, c1, s1); + i=n1; + } + + while(i < n) + { + temp = c*x[i] + s*y[i] ; + y[i] = c*y[i] - s*x[i] ; + x[i] = temp ; + + i++ ; + + } + + + } + else + { + + while(i < n) + { + temp = c*x[ix] + s*y[iy] ; + y[iy] = c*y[iy] - s*x[ix] ; + x[ix] = temp ; + + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + + } + return(0); + +} + + diff --git a/kernel/power/srot_microk_power8.c b/kernel/power/srot_microk_power8.c new file mode 100644 index 000000000..ade65500f --- /dev/null +++ b/kernel/power/srot_microk_power8.c @@ -0,0 +1,208 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2016/03/27 Werner Saar (wernsaar@googlemail.com) +* +* I don't use fused multiply-add ( precision problems with lapack ) +* +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#define HAVE_KERNEL_16 1 + +static void srot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) __attribute__ ((noinline)); + +static void srot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) +{ + + + BLASLONG i = n; + BLASLONG o16 = 16; + BLASLONG o32 = 32; + BLASLONG o48 = 48; + FLOAT *x1=x; + FLOAT *y1=y; + FLOAT *x2=x+1; + FLOAT *y2=y+1; + + __asm__ __volatile__ + ( + + "lxvw4x 36 , 0, %3 \n\t" // load c + "lxvw4x 37 , 0, %4 \n\t" // load s + "addi %8 , %8, -4 \n\t" + "addi %9 , %9, -4 \n\t" + + "lxvw4x 32, 0, %1 \n\t" // load x + "lxvw4x 33, %5, %1 \n\t" + "lxvw4x 34, %6, %1 \n\t" + "lxvw4x 35, %7, %1 \n\t" + + "lxvw4x 40, 0, %2 \n\t" // load y + "lxvw4x 41, %5, %2 \n\t" + "lxvw4x 42, %6, %2 \n\t" + "lxvw4x 43, %7, %2 \n\t" + + "addi %1, %1, 64 \n\t" + "addi %2, %2, 64 \n\t" + + "addic. 
%0 , %0 , -16 \n\t" + "ble 2f \n\t" + + ".align 5 \n\t" + "1: \n\t" + + "xvmulsp 48, 32, 36 \n\t" // c * x + "xvmulsp 49, 33, 36 \n\t" + "xvmulsp 50, 34, 36 \n\t" + "xvmulsp 51, 35, 36 \n\t" + + "xvmulsp 56, 40, 36 \n\t" // c * y + "xvmulsp 57, 41, 36 \n\t" + "xvmulsp 58, 42, 36 \n\t" + "xvmulsp 59, 43, 36 \n\t" + + "xvmulsp 52, 32, 37 \n\t" // s * x + "xvmulsp 53, 33, 37 \n\t" + + "lxvw4x 32, 0, %1 \n\t" // load x + "lxvw4x 33, %5, %1 \n\t" + + "xvmulsp 54, 34, 37 \n\t" + "xvmulsp 55, 35, 37 \n\t" + + "lxvw4x 34, %6, %1 \n\t" + "lxvw4x 35, %7, %1 \n\t" + + "xvmulsp 60, 40, 37 \n\t" // s * y + "xvmulsp 61, 41, 37 \n\t" + + "lxvw4x 40, 0, %2 \n\t" // load y + "lxvw4x 41, %5, %2 \n\t" + + "xvmulsp 62, 42, 37 \n\t" + "xvmulsp 63, 43, 37 \n\t" + + "lxvw4x 42, %6, %2 \n\t" + "lxvw4x 43, %7, %2 \n\t" + + "xvaddsp 48, 48 , 60 \n\t" // c * x + s * y + "xvaddsp 49, 49 , 61 \n\t" // c * x + s * y + + "addi %1, %1, 64 \n\t" + "addi %2, %2, 64 \n\t" + + "xvaddsp 50, 50 , 62 \n\t" // c * x + s * y + "xvaddsp 51, 51 , 63 \n\t" // c * x + s * y + + "xvsubsp 56, 56 , 52 \n\t" // c * y - s * x + "xvsubsp 57, 57 , 53 \n\t" // c * y - s * x + "xvsubsp 58, 58 , 54 \n\t" // c * y - s * x + "xvsubsp 59, 59 , 55 \n\t" // c * y - s * x + + "stxvw4x 48, 0, %8 \n\t" // store x + "stxvw4x 49, %5, %8 \n\t" + "stxvw4x 50, %6, %8 \n\t" + "stxvw4x 51, %7, %8 \n\t" + + "stxvw4x 56, 0, %9 \n\t" // store y + "stxvw4x 57, %5, %9 \n\t" + "stxvw4x 58, %6, %9 \n\t" + "stxvw4x 59, %7, %9 \n\t" + + "addi %8, %8, 64 \n\t" + "addi %9, %9, 64 \n\t" + + "addic. 
%0 , %0 , -16 \n\t" + "bgt 1b \n\t" + + "2: \n\t" + + "xvmulsp 48, 32, 36 \n\t" // c * x + "xvmulsp 49, 33, 36 \n\t" + "xvmulsp 50, 34, 36 \n\t" + "xvmulsp 51, 35, 36 \n\t" + + "xvmulsp 56, 40, 36 \n\t" // c * y + "xvmulsp 57, 41, 36 \n\t" + "xvmulsp 58, 42, 36 \n\t" + "xvmulsp 59, 43, 36 \n\t" + + "xvmulsp 52, 32, 37 \n\t" // s * x + "xvmulsp 53, 33, 37 \n\t" + "xvmulsp 54, 34, 37 \n\t" + "xvmulsp 55, 35, 37 \n\t" + + "xvmulsp 60, 40, 37 \n\t" // s * y + "xvmulsp 61, 41, 37 \n\t" + "xvmulsp 62, 42, 37 \n\t" + "xvmulsp 63, 43, 37 \n\t" + + "xvaddsp 48, 48 , 60 \n\t" // c * x + s * y + "xvaddsp 49, 49 , 61 \n\t" // c * x + s * y + "xvaddsp 50, 50 , 62 \n\t" // c * x + s * y + "xvaddsp 51, 51 , 63 \n\t" // c * x + s * y + + "xvsubsp 56, 56 , 52 \n\t" // c * y - s * x + "xvsubsp 57, 57 , 53 \n\t" // c * y - s * x + "xvsubsp 58, 58 , 54 \n\t" // c * y - s * x + "xvsubsp 59, 59 , 55 \n\t" // c * y - s * x + + "stxvw4x 48, 0, %8 \n\t" // store x + "stxvw4x 49, %5, %8 \n\t" + "stxvw4x 50, %6, %8 \n\t" + "stxvw4x 51, %7, %8 \n\t" + + "stxvw4x 56, 0, %9 \n\t" // store y + "stxvw4x 57, %5, %9 \n\t" + "stxvw4x 58, %6, %9 \n\t" + "stxvw4x 59, %7, %9 \n\t" + + + + : + : + "r" (i), // 0 + "r" (x1), // 1 + "r" (y1), // 2 + "r" (c), // 3 + "r" (s), // 4 + "r" (o16), // 5 + "r" (o32), // 6 + "r" (o48), // 7 + "r" (x2), // 8 + "r" (y2) // 9 + : "cr0", "%0", "%1" , "%2", "%8", "%9", "memory" + ); + +} + + From f1a5dd06c530334f85d3c1b100ce630c347f9907 Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Sun, 27 Mar 2016 11:05:56 +0200 Subject: [PATCH 27/48] added optimized sscal kernel for POWER8 --- kernel/power/KERNEL.POWER8 | 2 +- kernel/power/sscal.c | 179 +++++++++++++++++++++++ kernel/power/sscal_microk_power8.c | 218 +++++++++++++++++++++++++++++ 3 files changed, 398 insertions(+), 1 deletion(-) create mode 100644 kernel/power/sscal.c create mode 100644 kernel/power/sscal_microk_power8.c diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index 99bb38096..dc670d0bd 
100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -135,7 +135,7 @@ DROTKERNEL = drot.c #CROTKERNEL = ../arm/zrot.c #ZROTKERNEL = ../arm/zrot.c # -#SSCALKERNEL = ../arm/scal.c +SSCALKERNEL = sscal.c DSCALKERNEL = dscal.c #CSCALKERNEL = ../arm/zscal.c #ZSCALKERNEL = zscal.c diff --git a/kernel/power/sscal.c b/kernel/power/sscal.c new file mode 100644 index 000000000..c6ef5e969 --- /dev/null +++ b/kernel/power/sscal.c @@ -0,0 +1,179 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/03/27 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#include "common.h" + +#if defined(POWER8) +#include "sscal_microk_power8.c" +#endif + + +#if !defined(HAVE_KERNEL_16) + +static void sscal_kernel_16( BLASLONG n, FLOAT *da , FLOAT *x ) +{ + + BLASLONG i; + FLOAT alpha = *da; + + for( i=0; i 0 ) + { + alpha[0]=da; + alpha[1]=da; + alpha[2]=da; + alpha[3]=da; + sscal_kernel_16_zero(n1 , alpha , x); + j=n1; + } + + while(j < n) + { + + x[j]=0.0; + j++; + } + + } + else + { + + BLASLONG n1 = n & -32; + if ( n1 > 0 ) + { + alpha[0]=da; + alpha[1]=da; + alpha[2]=da; + alpha[3]=da; + sscal_kernel_16(n1 , alpha , x); + j=n1; + } + while(j < n) + { + + x[j] = da * x[j] ; + j++; + } + } + + + } + else + { + + if ( da == 0.0 ) + { + + while(j < n) + { + + x[i]=0.0; + i += inc_x ; + j++; + } + + } + else + { + + while(j < n) + { + + x[i] = da * x[i] ; + i += inc_x ; + j++; + } + } + + } + return 0; + +} + + diff --git a/kernel/power/sscal_microk_power8.c b/kernel/power/sscal_microk_power8.c new file mode 100644 index 000000000..963cec777 --- /dev/null +++ b/kernel/power/sscal_microk_power8.c @@ -0,0 +1,218 @@ 
+/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2016/03/27 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#define HAVE_KERNEL_16 1 + +static void sscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline)); + +static void sscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + + + BLASLONG i = n; + BLASLONG o16 = 16; + BLASLONG o32 = 32; + BLASLONG o48 = 48; + BLASLONG o64 = 64; + BLASLONG o80 = 80; + BLASLONG o96 = 96; + BLASLONG o112 = 112; + FLOAT *x1=x; + FLOAT *x2=x+1; + BLASLONG pre = 384; + + __asm__ __volatile__ + ( + + "lxvw4x 32, 0, %3 \n\t" + "addi %1, %1, -4 \n\t" + + "dcbt %2, %4 \n\t" + + "lxvw4x 40, 0, %2 \n\t" + "lxvw4x 41, %5, %2 \n\t" + "lxvw4x 42, %6, %2 \n\t" + "lxvw4x 43, %7, %2 \n\t" + "lxvw4x 44, %8, %2 \n\t" + "lxvw4x 45, %9, %2 \n\t" + "lxvw4x 46, %10, %2 \n\t" + "lxvw4x 47, %11, %2 \n\t" + + "addi %2, %2, 128 \n\t" + + "addic. %0 , %0 , -32 \n\t" + "ble 2f \n\t" + + ".align 5 \n\t" + "1: \n\t" + + "dcbt %2, %4 \n\t" + + "xvmulsp 48, 40, 32 \n\t" + "xvmulsp 49, 41, 32 \n\t" + "lxvw4x 40, 0, %2 \n\t" + "lxvw4x 41, %5, %2 \n\t" + "xvmulsp 50, 42, 32 \n\t" + "xvmulsp 51, 43, 32 \n\t" + "lxvw4x 42, %6, %2 \n\t" + "lxvw4x 43, %7, %2 \n\t" + "xvmulsp 52, 44, 32 \n\t" + "xvmulsp 53, 45, 32 \n\t" + "lxvw4x 44, %8, %2 \n\t" + "lxvw4x 45, %9, %2 \n\t" + "xvmulsp 54, 46, 32 \n\t" + "xvmulsp 55, 47, 32 \n\t" + "lxvw4x 46, %10, %2 \n\t" + "lxvw4x 47, %11, %2 \n\t" + + "stxvw4x 48, 0, %1 \n\t" + "stxvw4x 49, %5, %1 \n\t" + "stxvw4x 50, %6, %1 \n\t" + "stxvw4x 51, %7, %1 \n\t" + "stxvw4x 52, %8, %1 \n\t" + "stxvw4x 53, %9, %1 \n\t" + "stxvw4x 54, %10, %1 \n\t" + "stxvw4x 55, %11, %1 \n\t" + + "addi %1, %1, 128 \n\t" + "addi %2, %2, 128 \n\t" + + "addic. 
%0 , %0 , -32 \n\t" + "bgt 1b \n\t" + + "2: \n\t" + + "xvmulsp 48, 40, 32 \n\t" + "xvmulsp 49, 41, 32 \n\t" + "xvmulsp 50, 42, 32 \n\t" + "xvmulsp 51, 43, 32 \n\t" + "xvmulsp 52, 44, 32 \n\t" + "xvmulsp 53, 45, 32 \n\t" + "xvmulsp 54, 46, 32 \n\t" + "xvmulsp 55, 47, 32 \n\t" + + "stxvw4x 48, 0, %1 \n\t" + "stxvw4x 49, %5, %1 \n\t" + "stxvw4x 50, %6, %1 \n\t" + "stxvw4x 51, %7, %1 \n\t" + "stxvw4x 52, %8, %1 \n\t" + "stxvw4x 53, %9, %1 \n\t" + "stxvw4x 54, %10, %1 \n\t" + "stxvw4x 55, %11, %1 \n\t" + + : + : + "r" (i), // 0 + "r" (x2), // 1 + "r" (x1), // 2 + "r" (alpha), // 3 + "r" (pre), // 4 + "r" (o16), // 5 + "r" (o32), // 6 + "r" (o48), // 7 + "r" (o64), // 8 + "r" (o80), // 9 + "r" (o96), // 10 + "r" (o112) // 11 + : "cr0", "%0", "%2" , "%1", "memory" + ); + +} + + +static void sscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline)); + +static void sscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + + + BLASLONG i = n; + BLASLONG o16 = 16; + BLASLONG o32 = 32; + BLASLONG o48 = 48; + BLASLONG o64 = 64; + BLASLONG o80 = 80; + BLASLONG o96 = 96; + BLASLONG o112 = 112; + FLOAT *x1=x; + FLOAT *x2=x+1; + BLASLONG pre = 384; + + __asm__ __volatile__ + ( + + "xxlxor 32 , 32 , 32 \n\t" + "addi %1, %1, -4 \n\t" + + + ".align 5 \n\t" + "1: \n\t" + + "stxvw4x 32, 0, %1 \n\t" + "stxvw4x 32, %5, %1 \n\t" + "stxvw4x 32, %6, %1 \n\t" + "stxvw4x 32, %7, %1 \n\t" + "stxvw4x 32, %8, %1 \n\t" + "stxvw4x 32, %9, %1 \n\t" + "stxvw4x 32, %10, %1 \n\t" + "stxvw4x 32, %11, %1 \n\t" + + "addi %1, %1, 128 \n\t" + + "addic. 
%0 , %0 , -32 \n\t" + "bgt 1b \n\t" + + "2: \n\t" + + : + : + "r" (i), // 0 + "r" (x2), // 1 + "r" (x1), // 2 + "r" (alpha), // 3 + "r" (pre), // 4 + "r" (o16), // 5 + "r" (o32), // 6 + "r" (o48), // 7 + "r" (o64), // 8 + "r" (o80), // 9 + "r" (o96), // 10 + "r" (o112) // 11 + : "cr0", "%0", "%2" , "%1", "memory" + ); + +} + + From 35c98a355613b677134709cfc0ac648147397314 Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Sun, 27 Mar 2016 16:31:50 +0200 Subject: [PATCH 28/48] added optimized zscal kernel for POWER8 --- kernel/power/KERNEL.POWER8 | 2 +- kernel/power/zscal.c | 176 +++++++++++++++++++++++ kernel/power/zscal_microk_power8.c | 224 +++++++++++++++++++++++++++++ 3 files changed, 401 insertions(+), 1 deletion(-) create mode 100644 kernel/power/zscal.c create mode 100644 kernel/power/zscal_microk_power8.c diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index dc670d0bd..7b5e2c273 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -138,7 +138,7 @@ DROTKERNEL = drot.c SSCALKERNEL = sscal.c DSCALKERNEL = dscal.c #CSCALKERNEL = ../arm/zscal.c -#ZSCALKERNEL = zscal.c +ZSCALKERNEL = zscal.c # SSWAPKERNEL = sswap.c DSWAPKERNEL = dswap.c diff --git a/kernel/power/zscal.c b/kernel/power/zscal.c new file mode 100644 index 000000000..213839a8f --- /dev/null +++ b/kernel/power/zscal.c @@ -0,0 +1,176 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/03/27 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + + +#include "common.h" + +#pragma GCC optimize "O1" + +#if defined(POWER8) +#include "zscal_microk_power8.c" +#endif + + +#ifndef HAVE_KERNEL_8 + +static void zscal_kernel_8(BLASLONG n, FLOAT *x, FLOAT *alpha) +{ + + BLASLONG i=0; + FLOAT *x1=x; + FLOAT alpha_r1=alpha[0]; + FLOAT alpha_r2=alpha[1]; + FLOAT alpha_i1=alpha[2]; + FLOAT alpha_i2=alpha[3]; + FLOAT temp00, temp01, temp10, temp11, temp20, temp21, temp30, temp31; + FLOAT x0_r, x0_i, x1_r, x1_i, x2_r, x2_i, x3_r, x3_i; + + while ( i 0 ) + { + alpha[0] = da_r; + alpha[1] = da_r; + alpha[2] = -da_i; + alpha[3] = da_i; + zscal_kernel_8(n1, x, alpha); + i=n1; + ip = n1 * 2; + + } + + while ( i < n ) + { 
+ + temp = da_r * x[ip] - da_i * x[ip+1] ; + x[ip+1] = da_r * x[ip+1] + da_i * x[ip] ; + x[ip] = temp; + ip += 2; + i++; + } + + } + else + { + + inc_x2 = 2 * inc_x; + + while ( i < n ) + { + + temp = da_r * x[ip] - da_i * x[ip+1] ; + x[ip+1] = da_r * x[ip+1] + da_i * x[ip] ; + x[ip] = temp; + ip += inc_x2; + i++; + } + + + } + + return(0); + +} + + diff --git a/kernel/power/zscal_microk_power8.c b/kernel/power/zscal_microk_power8.c new file mode 100644 index 000000000..5e09d8d79 --- /dev/null +++ b/kernel/power/zscal_microk_power8.c @@ -0,0 +1,224 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/03/25 Werner Saar (wernsaar@googlemail.com) +* +* I don't use fused multipy-add ( lapack precision problems ) +* +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#define HAVE_KERNEL_8 1 + +static void zscal_kernel_8( BLASLONG n, FLOAT *x, FLOAT *alpha) __attribute__ ((noinline)); + +static void zscal_kernel_8( BLASLONG n, FLOAT *x, FLOAT *alpha) +{ + + + BLASLONG i = n; + BLASLONG o16 = 16; + BLASLONG o32 = 32; + BLASLONG o48 = 48; + BLASLONG o64 = 64; + BLASLONG o80 = 80; + BLASLONG o96 = 96; + BLASLONG o112 = 112; + FLOAT *x1=x; + FLOAT *x2=x+1; + BLASLONG pre = 384; + + __asm__ __volatile__ + ( + + "lxvd2x 32, 0, %3 \n\t" // alpha_r , alpha_r + "lxvd2x 33, %5, %3 \n\t" // -alpha_i , alpha_i + "addi %1, %1, -8 \n\t" + + "dcbt %2, %4 \n\t" + + "lxvd2x 40, 0, %2 \n\t" // x0_r, x0_i + "lxvd2x 41, %5, %2 \n\t" + "lxvd2x 42, %6, %2 \n\t" + "lxvd2x 43, %7, %2 \n\t" + "lxvd2x 44, %8, %2 \n\t" + "lxvd2x 45, %9, %2 \n\t" + "lxvd2x 46, %10, %2 \n\t" + "lxvd2x 47, %11, %2 \n\t" + + "addi %2, %2, 128 \n\t" + + "addic. 
%0 , %0 , -8 \n\t" + "ble 2f \n\t" + + ".align 5 \n\t" + "1: \n\t" + + "dcbt %2, %4 \n\t" + + "xvmuldp 48, 40, 32 \n\t" // x0_r * alpha_r, x0_i * alpha_r + "xvmuldp 49, 41, 32 \n\t" + "xvmuldp 50, 42, 32 \n\t" + "xvmuldp 51, 43, 32 \n\t" + "xvmuldp 52, 44, 32 \n\t" + "xvmuldp 53, 45, 32 \n\t" + "xvmuldp 54, 46, 32 \n\t" + "xvmuldp 55, 47, 32 \n\t" + + "xxswapd 56, 40 \n\t" + "xxswapd 57, 41 \n\t" + "xxswapd 58, 42 \n\t" + "xxswapd 59, 43 \n\t" + "xxswapd 60, 44 \n\t" + "xxswapd 61, 45 \n\t" + "xxswapd 62, 46 \n\t" + "xxswapd 63, 47 \n\t" + + "xvmuldp 56, 56, 33 \n\t" // x0_i * -alpha_i, x0_r * alpha_i + "xvmuldp 57, 57, 33 \n\t" + + "lxvd2x 40, 0, %2 \n\t" // x0_r, x0_i + "lxvd2x 41, %5, %2 \n\t" + + "xvmuldp 58, 58, 33 \n\t" + "xvmuldp 59, 59, 33 \n\t" + + "lxvd2x 42, %6, %2 \n\t" + "lxvd2x 43, %7, %2 \n\t" + + "xvmuldp 60, 60, 33 \n\t" + "xvmuldp 61, 61, 33 \n\t" + + "lxvd2x 44, %8, %2 \n\t" + "lxvd2x 45, %9, %2 \n\t" + + "xvmuldp 62, 62, 33 \n\t" + "xvmuldp 63, 63, 33 \n\t" + + "lxvd2x 46, %10, %2 \n\t" + "lxvd2x 47, %11, %2 \n\t" + + "xvadddp 48, 48 , 56 \n\t" + "xvadddp 49, 49 , 57 \n\t" + "xvadddp 50, 50 , 58 \n\t" + "xvadddp 51, 51 , 59 \n\t" + + "stxvd2x 48, 0, %1 \n\t" + "stxvd2x 49, %5, %1 \n\t" + + "xvadddp 52, 52 , 60 \n\t" + "xvadddp 53, 53 , 61 \n\t" + + "stxvd2x 50, %6, %1 \n\t" + "stxvd2x 51, %7, %1 \n\t" + + "xvadddp 54, 54 , 62 \n\t" + "xvadddp 55, 55 , 63 \n\t" + + "stxvd2x 52, %8, %1 \n\t" + "stxvd2x 53, %9, %1 \n\t" + "stxvd2x 54, %10, %1 \n\t" + "stxvd2x 55, %11, %1 \n\t" + + "addi %1, %1, 128 \n\t" + "addi %2, %2, 128 \n\t" + + "addic. 
%0 , %0 , -8 \n\t" + "bgt 1b \n\t" + + "2: \n\t" + + "xvmuldp 48, 40, 32 \n\t" // x0_r * alpha_r, x0_i * alpha_r + "xvmuldp 49, 41, 32 \n\t" + "xvmuldp 50, 42, 32 \n\t" + "xvmuldp 51, 43, 32 \n\t" + "xvmuldp 52, 44, 32 \n\t" + "xvmuldp 53, 45, 32 \n\t" + "xvmuldp 54, 46, 32 \n\t" + "xvmuldp 55, 47, 32 \n\t" + + "xxswapd 56, 40 \n\t" + "xxswapd 57, 41 \n\t" + "xxswapd 58, 42 \n\t" + "xxswapd 59, 43 \n\t" + "xxswapd 60, 44 \n\t" + "xxswapd 61, 45 \n\t" + "xxswapd 62, 46 \n\t" + "xxswapd 63, 47 \n\t" + + "xvmuldp 56, 56, 33 \n\t" // x0_i * -alpha_i, x0_r * alpha_i + "xvmuldp 57, 57, 33 \n\t" + "xvmuldp 58, 58, 33 \n\t" + "xvmuldp 59, 59, 33 \n\t" + "xvmuldp 60, 60, 33 \n\t" + "xvmuldp 61, 61, 33 \n\t" + "xvmuldp 62, 62, 33 \n\t" + "xvmuldp 63, 63, 33 \n\t" + + "xvadddp 48, 48 , 56 \n\t" + "xvadddp 49, 49 , 57 \n\t" + "xvadddp 50, 50 , 58 \n\t" + "xvadddp 51, 51 , 59 \n\t" + "xvadddp 52, 52 , 60 \n\t" + "xvadddp 53, 53 , 61 \n\t" + "xvadddp 54, 54 , 62 \n\t" + "xvadddp 55, 55 , 63 \n\t" + + "stxvd2x 48, 0, %1 \n\t" + "stxvd2x 49, %5, %1 \n\t" + "stxvd2x 50, %6, %1 \n\t" + "stxvd2x 51, %7, %1 \n\t" + "stxvd2x 52, %8, %1 \n\t" + "stxvd2x 53, %9, %1 \n\t" + "stxvd2x 54, %10, %1 \n\t" + "stxvd2x 55, %11, %1 \n\t" + + + : + : + "r" (i), // 0 + "r" (x2), // 1 + "r" (x1), // 2 + "r" (alpha), // 3 + "r" (pre), // 4 + "r" (o16), // 5 + "r" (o32), // 6 + "r" (o48), // 7 + "r" (o64), // 8 + "r" (o80), // 9 + "r" (o96), // 10 + "r" (o112) // 11 + : "cr0", "%0", "%2" , "%1", "memory" + ); + +} + + From 659ed165918ab4f089a2554fdcdab3a1640abdd2 Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Sun, 27 Mar 2016 18:31:37 +0200 Subject: [PATCH 29/48] added otimized cswap and zswap kernels for POWER8 --- kernel/power/KERNEL.POWER8 | 4 +- kernel/power/cswap.c | 175 ++++++++++++++++++++++++++++ kernel/power/cswap_microk_power8.c | 180 +++++++++++++++++++++++++++++ kernel/power/zswap.c | 175 ++++++++++++++++++++++++++++ kernel/power/zswap_microk_power8.c | 180 +++++++++++++++++++++++++++++ 5 
files changed, 712 insertions(+), 2 deletions(-) create mode 100644 kernel/power/cswap.c create mode 100644 kernel/power/cswap_microk_power8.c create mode 100644 kernel/power/zswap.c create mode 100644 kernel/power/zswap_microk_power8.c diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index 7b5e2c273..8fbceb052 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -142,8 +142,8 @@ ZSCALKERNEL = zscal.c # SSWAPKERNEL = sswap.c DSWAPKERNEL = dswap.c -#CSWAPKERNEL = ../arm/zswap.c -#ZSWAPKERNEL = ../arm/zswap.c +CSWAPKERNEL = cswap.c +ZSWAPKERNEL = zswap.c # #SGEMVNKERNEL = ../arm/gemv_n.c diff --git a/kernel/power/cswap.c b/kernel/power/cswap.c new file mode 100644 index 000000000..da97c896e --- /dev/null +++ b/kernel/power/cswap.c @@ -0,0 +1,175 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/03/27 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#include "common.h" + + +#if defined(POWER8) +#include "cswap_microk_power8.c" +#endif + + +#ifndef HAVE_KERNEL_32 + +static void cswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) +{ + + BLASLONG i=0; + FLOAT f0, f1, f2, f3, f4, f5, f6, f7; + FLOAT g0, g1, g2, g3, g4, g5, g6, g7; + FLOAT *x1=x; + FLOAT *y1=y; + + while ( i 0 ) + { + cswap_kernel_32(n1, x, y); + i=n1; + ix = 2* n1; + iy = 2* n1; + } + + while(i < n) + { + + temp[0] = x[ix] ; + temp[1] = x[ix+1] ; + x[ix] = y[iy] ; + x[ix+1] = y[iy+1] ; + y[iy] = temp[0] ; + y[iy+1] = temp[1] ; + + ix += 2 ; + iy += 2 ; + i++ ; + + + } + + + } + else + { + + inc_x2 = 2 * inc_x; + inc_y2 = 2 * inc_y; + + while(i < n) + { + + temp[0] = x[ix] ; + temp[1] = x[ix+1] ; + x[ix] = y[iy] ; + x[ix+1] = y[iy+1] ; + y[iy] = temp[0] ; + y[iy+1] = temp[1] ; + + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + + } + + } + return(0); + + +} + + diff --git a/kernel/power/cswap_microk_power8.c b/kernel/power/cswap_microk_power8.c new file mode 100644 index 000000000..90ab59c54 --- /dev/null +++ b/kernel/power/cswap_microk_power8.c @@ -0,0 +1,180 @@ 
+/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2016/03/27 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#define HAVE_KERNEL_32 1 + +static void cswap_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); + +static void cswap_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y) +{ + + + BLASLONG i = n; + BLASLONG o16 = 16; + BLASLONG o32 = 32; + BLASLONG o48 = 48; + BLASLONG o64 = 64; + BLASLONG o80 = 80; + BLASLONG o96 = 96; + BLASLONG o112 = 112; + FLOAT *x1=x; + FLOAT *y1=y; + FLOAT *x2=x+1; + FLOAT *y2=y+1; + BLASLONG pre = 384; + BLASLONG alpha=0; + + __asm__ __volatile__ + ( + + "addi %3, %3, -4 \n\t" + "addi %4, %4, -4 \n\t" + + ".align 5 \n\t" + "1: \n\t" + + "lxvw4x 32, 0, %2 \n\t" + "lxvw4x 33, %5, %2 \n\t" + "lxvw4x 34, %6, %2 \n\t" + "lxvw4x 35, %7, %2 \n\t" + "lxvw4x 36, %8, %2 \n\t" + "lxvw4x 37, %9, %2 \n\t" + "lxvw4x 38, %10, %2 \n\t" + "lxvw4x 39, %11, %2 \n\t" + + "addi %2, %2, 128 \n\t" + + "lxvw4x 40, 0, %2 \n\t" + "lxvw4x 41, %5, %2 \n\t" + "lxvw4x 42, %6, %2 \n\t" + "lxvw4x 43, %7, %2 \n\t" + "lxvw4x 44, %8, %2 \n\t" + "lxvw4x 45, %9, %2 \n\t" + "lxvw4x 46, %10, %2 \n\t" + "lxvw4x 47, %11, %2 \n\t" + + "addi %2, %2, 128 \n\t" + + "lxvw4x 48, 0, %1 \n\t" + "lxvw4x 49, %5, %1 \n\t" + "lxvw4x 50, %6, %1 \n\t" + "lxvw4x 51, %7, %1 \n\t" + "lxvw4x 52, %8, %1 \n\t" + "lxvw4x 53, %9, %1 \n\t" + "lxvw4x 54, %10, %1 \n\t" + "lxvw4x 55, %11, %1 \n\t" + + "addi %1, %1, 128 \n\t" + + "lxvw4x 56, 0, %1 \n\t" + "lxvw4x 57, %5, %1 \n\t" + "lxvw4x 58, %6, %1 \n\t" + "lxvw4x 59, %7, %1 \n\t" + "lxvw4x 60, %8, %1 \n\t" + "lxvw4x 61, %9, %1 \n\t" + "lxvw4x 62, %10, %1 \n\t" + "lxvw4x 63, %11, %1 \n\t" + + "addi %1, %1, 128 \n\t" + + "stxvw4x 32, 0, %3 \n\t" + "stxvw4x 33, %5, %3 \n\t" + 
"stxvw4x 34, %6, %3 \n\t" + "stxvw4x 35, %7, %3 \n\t" + "stxvw4x 36, %8, %3 \n\t" + "stxvw4x 37, %9, %3 \n\t" + "stxvw4x 38, %10, %3 \n\t" + "stxvw4x 39, %11, %3 \n\t" + + "addi %3, %3, 128 \n\t" + + "stxvw4x 40, 0, %3 \n\t" + "stxvw4x 41, %5, %3 \n\t" + "stxvw4x 42, %6, %3 \n\t" + "stxvw4x 43, %7, %3 \n\t" + "stxvw4x 44, %8, %3 \n\t" + "stxvw4x 45, %9, %3 \n\t" + "stxvw4x 46, %10, %3 \n\t" + "stxvw4x 47, %11, %3 \n\t" + + "addi %3, %3, 128 \n\t" + + "stxvw4x 48, 0, %4 \n\t" + "stxvw4x 49, %5, %4 \n\t" + "stxvw4x 50, %6, %4 \n\t" + "stxvw4x 51, %7, %4 \n\t" + "stxvw4x 52, %8, %4 \n\t" + "stxvw4x 53, %9, %4 \n\t" + "stxvw4x 54, %10, %4 \n\t" + "stxvw4x 55, %11, %4 \n\t" + + "addi %4, %4, 128 \n\t" + + "stxvw4x 56, 0, %4 \n\t" + "stxvw4x 57, %5, %4 \n\t" + "stxvw4x 58, %6, %4 \n\t" + "stxvw4x 59, %7, %4 \n\t" + "stxvw4x 60, %8, %4 \n\t" + "stxvw4x 61, %9, %4 \n\t" + "stxvw4x 62, %10, %4 \n\t" + "stxvw4x 63, %11, %4 \n\t" + + "addi %4, %4, 128 \n\t" + + "addic. %0 , %0 , -32 \n\t" + "bgt 1b \n\t" + + "2: \n\t" + + : + : + "r" (i), // 0 + "r" (y1), // 1 + "r" (x1), // 2 + "r" (y2), // 3 + "r" (x2), // 4 + "r" (o16), // 5 + "r" (o32), // 6 + "r" (o48), // 7 + "r" (o64), // 8 + "r" (o80), // 9 + "r" (o96), // 10 + "r" (o112) // 11 + : "cr0", "%0", "%2" , "%1", "%3", "%4", "memory" + ); + +} + + diff --git a/kernel/power/zswap.c b/kernel/power/zswap.c new file mode 100644 index 000000000..5ec1eee2e --- /dev/null +++ b/kernel/power/zswap.c @@ -0,0 +1,175 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2016/03/27 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#include "common.h" + + +#if defined(POWER8) +#include "zswap_microk_power8.c" +#endif + + +#ifndef HAVE_KERNEL_16 + +static void zswap_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) +{ + + BLASLONG i=0; + FLOAT f0, f1, f2, f3, f4, f5, f6, f7; + FLOAT g0, g1, g2, g3, g4, g5, g6, g7; + FLOAT *x1=x; + FLOAT *y1=y; + + while ( i 0 ) + { + zswap_kernel_16(n1, x, y); + i=n1; + ix = 2* n1; + iy = 2* n1; + } + + while(i < n) + { + + temp[0] = x[ix] ; + temp[1] = x[ix+1] ; + x[ix] = y[iy] ; + x[ix+1] = y[iy+1] ; + y[iy] = temp[0] ; + y[iy+1] = temp[1] ; + + ix += 2 ; + iy += 2 ; + i++ ; + + + } + + + } + else + { + + inc_x2 = 2 * inc_x; + inc_y2 = 2 * inc_y; + + while(i < n) + { + + temp[0] = x[ix] ; + temp[1] = x[ix+1] ; + x[ix] = y[iy] ; + x[ix+1] = y[iy+1] ; + y[iy] = temp[0] ; + y[iy+1] = temp[1] ; + + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + + } + + } + return(0); + + +} + + diff --git a/kernel/power/zswap_microk_power8.c b/kernel/power/zswap_microk_power8.c new file mode 100644 index 000000000..9e5623752 --- /dev/null +++ b/kernel/power/zswap_microk_power8.c @@ -0,0 +1,180 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2016/03/27 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#define HAVE_KERNEL_16 1 + +static void zswap_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); + +static void zswap_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y) +{ + + + BLASLONG i = n; + BLASLONG o16 = 16; + BLASLONG o32 = 32; + BLASLONG o48 = 48; + BLASLONG o64 = 64; + BLASLONG o80 = 80; + BLASLONG o96 = 96; + BLASLONG o112 = 112; + FLOAT *x1=x; + FLOAT *y1=y; + FLOAT *x2=x+1; + FLOAT *y2=y+1; + BLASLONG pre = 384; + BLASLONG alpha=0; + + __asm__ __volatile__ + ( + + "addi %3, %3, -8 \n\t" + "addi %4, %4, -8 \n\t" + + ".align 5 \n\t" + "1: \n\t" + + "lxvd2x 32, 0, %2 \n\t" + "lxvd2x 33, %5, %2 \n\t" + "lxvd2x 34, %6, %2 \n\t" + "lxvd2x 35, %7, %2 \n\t" + "lxvd2x 36, %8, %2 \n\t" + "lxvd2x 37, %9, %2 \n\t" + "lxvd2x 38, %10, %2 \n\t" + "lxvd2x 39, %11, %2 \n\t" + + "addi %2, %2, 128 \n\t" + + "lxvd2x 40, 0, %2 \n\t" + "lxvd2x 41, %5, %2 \n\t" + "lxvd2x 42, %6, %2 \n\t" + "lxvd2x 43, %7, %2 \n\t" + "lxvd2x 44, %8, %2 \n\t" + "lxvd2x 45, %9, %2 \n\t" + "lxvd2x 46, %10, %2 \n\t" + "lxvd2x 47, %11, %2 \n\t" + + "addi %2, %2, 128 \n\t" + + "lxvd2x 48, 0, %1 \n\t" + "lxvd2x 49, %5, %1 \n\t" + "lxvd2x 50, %6, %1 \n\t" + "lxvd2x 51, %7, %1 \n\t" + "lxvd2x 52, %8, %1 \n\t" + "lxvd2x 53, %9, %1 \n\t" + "lxvd2x 54, %10, %1 \n\t" + "lxvd2x 55, %11, %1 \n\t" + + "addi %1, %1, 128 \n\t" + + "lxvd2x 56, 0, %1 \n\t" + "lxvd2x 57, %5, %1 \n\t" + "lxvd2x 58, %6, %1 \n\t" + "lxvd2x 59, %7, %1 \n\t" + "lxvd2x 60, %8, %1 \n\t" + "lxvd2x 61, %9, %1 \n\t" + "lxvd2x 62, %10, %1 \n\t" + "lxvd2x 63, %11, %1 \n\t" + + "addi %1, %1, 128 \n\t" + + "stxvd2x 32, 0, %3 \n\t" + "stxvd2x 33, %5, %3 \n\t" + 
"stxvd2x 34, %6, %3 \n\t" + "stxvd2x 35, %7, %3 \n\t" + "stxvd2x 36, %8, %3 \n\t" + "stxvd2x 37, %9, %3 \n\t" + "stxvd2x 38, %10, %3 \n\t" + "stxvd2x 39, %11, %3 \n\t" + + "addi %3, %3, 128 \n\t" + + "stxvd2x 40, 0, %3 \n\t" + "stxvd2x 41, %5, %3 \n\t" + "stxvd2x 42, %6, %3 \n\t" + "stxvd2x 43, %7, %3 \n\t" + "stxvd2x 44, %8, %3 \n\t" + "stxvd2x 45, %9, %3 \n\t" + "stxvd2x 46, %10, %3 \n\t" + "stxvd2x 47, %11, %3 \n\t" + + "addi %3, %3, 128 \n\t" + + "stxvd2x 48, 0, %4 \n\t" + "stxvd2x 49, %5, %4 \n\t" + "stxvd2x 50, %6, %4 \n\t" + "stxvd2x 51, %7, %4 \n\t" + "stxvd2x 52, %8, %4 \n\t" + "stxvd2x 53, %9, %4 \n\t" + "stxvd2x 54, %10, %4 \n\t" + "stxvd2x 55, %11, %4 \n\t" + + "addi %4, %4, 128 \n\t" + + "stxvd2x 56, 0, %4 \n\t" + "stxvd2x 57, %5, %4 \n\t" + "stxvd2x 58, %6, %4 \n\t" + "stxvd2x 59, %7, %4 \n\t" + "stxvd2x 60, %8, %4 \n\t" + "stxvd2x 61, %9, %4 \n\t" + "stxvd2x 62, %10, %4 \n\t" + "stxvd2x 63, %11, %4 \n\t" + + "addi %4, %4, 128 \n\t" + + "addic. %0 , %0 , -16 \n\t" + "bgt 1b \n\t" + + "2: \n\t" + + : + : + "r" (i), // 0 + "r" (y1), // 1 + "r" (x1), // 2 + "r" (y2), // 3 + "r" (x2), // 4 + "r" (o16), // 5 + "r" (o32), // 6 + "r" (o48), // 7 + "r" (o64), // 8 + "r" (o80), // 9 + "r" (o96), // 10 + "r" (o112) // 11 + : "cr0", "%0", "%2" , "%1", "%3", "%4", "memory" + ); + +} + + From c53be46d7830691b091f1803fec4781ad7807954 Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Mon, 28 Mar 2016 12:17:15 +0200 Subject: [PATCH 30/48] added optimized dasum kernel for POWER8 --- kernel/power/KERNEL.POWER8 | 2 +- kernel/power/dasum.c | 144 +++++++++++++++++++++++ kernel/power/dasum_microk_power8.c | 177 +++++++++++++++++++++++++++++ 3 files changed, 322 insertions(+), 1 deletion(-) create mode 100644 kernel/power/dasum.c create mode 100644 kernel/power/dasum_microk_power8.c diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index 8fbceb052..68eec2b2a 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -106,7 +106,7 @@ 
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c #IDMINKERNEL = ../arm/imin.c # #SASUMKERNEL = ../arm/asum.c -#DASUMKERNEL = ../arm/asum.c +DASUMKERNEL = dasum.c #CASUMKERNEL = ../arm/zasum.c #ZASUMKERNEL = ../arm/zasum.c # diff --git a/kernel/power/dasum.c b/kernel/power/dasum.c new file mode 100644 index 000000000..77f5345ba --- /dev/null +++ b/kernel/power/dasum.c @@ -0,0 +1,144 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2016/03/28 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + +#if defined(POWER8) +#include "dasum_microk_power8.c" +#endif + + +#ifndef HAVE_KERNEL_16 + +static void dasum_kernel_16(BLASLONG n, FLOAT *x1, FLOAT *svec) +{ + + BLASLONG i=0; + FLOAT *x = x1; + FLOAT temp0, temp1, temp2, temp3; + FLOAT temp4, temp5, temp6, temp7; + FLOAT sum0 = 0.0; + FLOAT sum1 = 0.0; + FLOAT sum2 = 0.0; + FLOAT sum3 = 0.0; + + while ( i< n ) + { + + temp0 = ABS(x[0]); + temp1 = ABS(x[1]); + temp2 = ABS(x[2]); + temp3 = ABS(x[3]); + temp4 = ABS(x[4]); + temp5 = ABS(x[5]); + temp6 = ABS(x[6]); + temp7 = ABS(x[7]); + + sum0 += temp0; + sum1 += temp1; + sum2 += temp2; + sum3 += temp3; + + sum0 += temp4; + sum1 += temp5; + sum2 += temp6; + sum3 += temp7; + + x+=8; + i+=8; + + } + + svec[0] = sum0+sum1+sum2+sum3; + svec[1] = 0.0; + +} + +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + FLOAT sumf = 0.0; + FLOAT svec[2] __attribute__ ((aligned (16)));; + BLASLONG n1; + + if (n <= 0 || inc_x <= 0) return(sumf); + + if ( inc_x == 1 ) + { + + n1 = n & -16; + if ( n1 > 0 ) + { + + dasum_kernel_16(n1, x, svec); + sumf = svec[0] + svec[1]; + i=n1; + } + + while(i < n) + { + sumf += ABS(x[i]); + i++; + } + + } + else + { + + n *= inc_x; + while(i < n) + { + sumf += ABS(x[i]); + i += inc_x; + } + + } + return(sumf); +} + + diff --git a/kernel/power/dasum_microk_power8.c b/kernel/power/dasum_microk_power8.c new file mode 100644 index 000000000..cc38c4f7d --- /dev/null +++ b/kernel/power/dasum_microk_power8.c @@ -0,0 +1,177 @@ 
+/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2016/03/28 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#define HAVE_KERNEL_16 1 +static void dasum_kernel_16( BLASLONG n, FLOAT *x, FLOAT *svec) __attribute__ ((noinline)); + +static void dasum_kernel_16( BLASLONG n, FLOAT *x, FLOAT *svec) +{ + + + BLASLONG i = n; + BLASLONG o16 = 16; + BLASLONG o32 = 32; + BLASLONG o48 = 48; + BLASLONG o64 = 64; + BLASLONG o80 = 80; + BLASLONG o96 = 96; + BLASLONG o112 = 112; + FLOAT *x1=x; + BLASLONG pre = 384; + + __asm__ __volatile__ + ( + + "dcbt %2 , %4 \n\t" + + "xxlxor 32,32,32 \n\t" + "xxlxor 33,33,33 \n\t" + "xxlxor 34,34,34 \n\t" + "xxlxor 35,35,35 \n\t" + "xxlxor 36,36,36 \n\t" + "xxlxor 37,37,37 \n\t" + "xxlxor 38,38,38 \n\t" + "xxlxor 39,39,39 \n\t" + + "lxvd2x 40, 0, %2 \n\t" + "lxvd2x 41, %5, %2 \n\t" + "lxvd2x 42, %6, %2 \n\t" + "lxvd2x 43, %7, %2 \n\t" + "lxvd2x 44, %8, %2 \n\t" + "lxvd2x 45, %9, %2 \n\t" + "lxvd2x 46, %10, %2 \n\t" + "lxvd2x 47, %11, %2 \n\t" + + "addi %2, %2, 128 \n\t" + + "addic. 
%0 , %0 , -16 \n\t" + "ble 2f \n\t" + + ".align 5 \n\t" + "1: \n\t" + + "dcbt %2 , %4 \n\t" + + "xvabsdp 48, 40 \n\t" + "xvabsdp 49, 41 \n\t" + "xvabsdp 50, 42 \n\t" + "xvabsdp 51, 43 \n\t" + + "lxvd2x 40, 0, %2 \n\t" + "lxvd2x 41, %5, %2 \n\t" + + "xvabsdp 52, 44 \n\t" + "xvabsdp 53, 45 \n\t" + + "lxvd2x 42, %6, %2 \n\t" + "lxvd2x 43, %7, %2 \n\t" + + "xvabsdp 54, 46 \n\t" + "xvabsdp 55, 47 \n\t" + + "lxvd2x 44, %8, %2 \n\t" + "lxvd2x 45, %9, %2 \n\t" + + "xvadddp 32, 32, 48 \n\t" + "xvadddp 33, 33, 49 \n\t" + + "lxvd2x 46, %10, %2 \n\t" + "lxvd2x 47, %11, %2 \n\t" + + "xvadddp 34, 34, 50 \n\t" + "xvadddp 35, 35, 51 \n\t" + "addi %2, %2, 128 \n\t" + "xvadddp 36, 36, 52 \n\t" + "xvadddp 37, 37, 53 \n\t" + "addic. %0 , %0 , -16 \n\t" + "xvadddp 38, 38, 54 \n\t" + "xvadddp 39, 39, 55 \n\t" + + "bgt 1b \n\t" + + "2: \n\t" + + + "xvabsdp 48, 40 \n\t" + "xvabsdp 49, 41 \n\t" + "xvabsdp 50, 42 \n\t" + "xvabsdp 51, 43 \n\t" + "xvabsdp 52, 44 \n\t" + "xvabsdp 53, 45 \n\t" + "xvabsdp 54, 46 \n\t" + "xvabsdp 55, 47 \n\t" + + "xvadddp 32, 32, 48 \n\t" + "xvadddp 33, 33, 49 \n\t" + "xvadddp 34, 34, 50 \n\t" + "xvadddp 35, 35, 51 \n\t" + "xvadddp 36, 36, 52 \n\t" + "xvadddp 37, 37, 53 \n\t" + "xvadddp 38, 38, 54 \n\t" + "xvadddp 39, 39, 55 \n\t" + + "xvadddp 32, 32, 33 \n\t" + "xvadddp 34, 34, 35 \n\t" + "xvadddp 36, 36, 37 \n\t" + "xvadddp 38, 38, 39 \n\t" + + "xvadddp 32, 32, 34 \n\t" + "xvadddp 36, 36, 38 \n\t" + + "xvadddp 32, 32, 36 \n\t" + + + "stxvd2x 32, 0, %3 \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x1), // 2 + "r" (svec), // 3 + "r" (pre), // 4 + "r" (o16), // 5 + "r" (o32), // 6 + "r" (o48), // 7 + "r" (o64), // 8 + "r" (o80), // 9 + "r" (o96), // 10 + "r" (o112) // 11 + : "cr0", "%0", "%2", "memory" + ); + +} + + From f59c9bd6efb329df81238a4da55f343ebd3c11af Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Mon, 28 Mar 2016 12:44:25 +0200 Subject: [PATCH 31/48] added optimized sasum kernel for POWER8 --- kernel/power/KERNEL.POWER8 | 2 +- 
kernel/power/sasum.c | 146 ++++++++++++++++++++++++ kernel/power/sasum_microk_power8.c | 177 +++++++++++++++++++++++++++++ 3 files changed, 324 insertions(+), 1 deletion(-) create mode 100644 kernel/power/sasum.c create mode 100644 kernel/power/sasum_microk_power8.c diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index 68eec2b2a..495725822 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -105,7 +105,7 @@ ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c #ISMINKERNEL = ../arm/imin.c #IDMINKERNEL = ../arm/imin.c # -#SASUMKERNEL = ../arm/asum.c +SASUMKERNEL = sasum.c DASUMKERNEL = dasum.c #CASUMKERNEL = ../arm/zasum.c #ZASUMKERNEL = ../arm/zasum.c diff --git a/kernel/power/sasum.c b/kernel/power/sasum.c new file mode 100644 index 000000000..43311f2ba --- /dev/null +++ b/kernel/power/sasum.c @@ -0,0 +1,146 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/03/28 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + +#if defined(POWER8) +#include "sasum_microk_power8.c" +#endif + + +#ifndef HAVE_KERNEL_32 + +static void sasum_kernel_32(BLASLONG n, FLOAT *x1, FLOAT *svec) +{ + + BLASLONG i=0; + FLOAT *x = x1; + FLOAT temp0, temp1, temp2, temp3; + FLOAT temp4, temp5, temp6, temp7; + FLOAT sum0 = 0.0; + FLOAT sum1 = 0.0; + FLOAT sum2 = 0.0; + FLOAT sum3 = 0.0; + + while ( i< n ) + { + + temp0 = ABS(x[0]); + temp1 = ABS(x[1]); + temp2 = ABS(x[2]); + temp3 = ABS(x[3]); + temp4 = ABS(x[4]); + temp5 = ABS(x[5]); + temp6 = ABS(x[6]); + temp7 = ABS(x[7]); + + sum0 += temp0; + sum1 += temp1; + sum2 += temp2; + sum3 += temp3; + + sum0 += temp4; + sum1 += temp5; + sum2 += temp6; + sum3 += temp7; + + x+=8; + i+=8; + + } + + svec[0] = sum0+sum1+sum2+sum3; + svec[1] = 0.0; + svec[2] = 0.0; + svec[3] = 0.0; + +} + +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + FLOAT sumf = 0.0; + FLOAT svec[4] __attribute__ ((aligned (16)));; + BLASLONG n1; + + if (n <= 0 || 
inc_x <= 0) return(sumf); + + if ( inc_x == 1 ) + { + + n1 = n & -32; + if ( n1 > 0 ) + { + + sasum_kernel_32(n1, x, svec); + sumf = svec[0] + svec[1]+svec[2]+svec[3]; + i=n1; + } + + while(i < n) + { + sumf += ABS(x[i]); + i++; + } + + } + else + { + + n *= inc_x; + while(i < n) + { + sumf += ABS(x[i]); + i += inc_x; + } + + } + return(sumf); +} + + diff --git a/kernel/power/sasum_microk_power8.c b/kernel/power/sasum_microk_power8.c new file mode 100644 index 000000000..847fffe04 --- /dev/null +++ b/kernel/power/sasum_microk_power8.c @@ -0,0 +1,177 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/03/28 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#define HAVE_KERNEL_32 1 +static void sasum_kernel_32( BLASLONG n, FLOAT *x, FLOAT *svec) __attribute__ ((noinline)); + +static void sasum_kernel_32( BLASLONG n, FLOAT *x, FLOAT *svec) +{ + + + BLASLONG i = n; + BLASLONG o16 = 16; + BLASLONG o32 = 32; + BLASLONG o48 = 48; + BLASLONG o64 = 64; + BLASLONG o80 = 80; + BLASLONG o96 = 96; + BLASLONG o112 = 112; + FLOAT *x1=x; + BLASLONG pre = 384; + + __asm__ __volatile__ + ( + + "dcbt %2 , %4 \n\t" + + "xxlxor 32,32,32 \n\t" + "xxlxor 33,33,33 \n\t" + "xxlxor 34,34,34 \n\t" + "xxlxor 35,35,35 \n\t" + "xxlxor 36,36,36 \n\t" + "xxlxor 37,37,37 \n\t" + "xxlxor 38,38,38 \n\t" + "xxlxor 39,39,39 \n\t" + + "lxvw4x 40, 0, %2 \n\t" + "lxvw4x 41, %5, %2 \n\t" + "lxvw4x 42, %6, %2 \n\t" + "lxvw4x 43, %7, %2 \n\t" + "lxvw4x 44, %8, %2 \n\t" + "lxvw4x 45, %9, %2 \n\t" + "lxvw4x 46, %10, %2 \n\t" + "lxvw4x 47, %11, %2 \n\t" + + "addi %2, %2, 128 \n\t" + + "addic. 
%0 , %0 , -32 \n\t" + "ble 2f \n\t" + + ".align 5 \n\t" + "1: \n\t" + + "dcbt %2 , %4 \n\t" + + "xvabssp 48, 40 \n\t" + "xvabssp 49, 41 \n\t" + "xvabssp 50, 42 \n\t" + "xvabssp 51, 43 \n\t" + + "lxvw4x 40, 0, %2 \n\t" + "lxvw4x 41, %5, %2 \n\t" + + "xvabssp 52, 44 \n\t" + "xvabssp 53, 45 \n\t" + + "lxvw4x 42, %6, %2 \n\t" + "lxvw4x 43, %7, %2 \n\t" + + "xvabssp 54, 46 \n\t" + "xvabssp 55, 47 \n\t" + + "lxvw4x 44, %8, %2 \n\t" + "lxvw4x 45, %9, %2 \n\t" + + "xvaddsp 32, 32, 48 \n\t" + "xvaddsp 33, 33, 49 \n\t" + + "lxvw4x 46, %10, %2 \n\t" + "lxvw4x 47, %11, %2 \n\t" + + "xvaddsp 34, 34, 50 \n\t" + "xvaddsp 35, 35, 51 \n\t" + "addi %2, %2, 128 \n\t" + "xvaddsp 36, 36, 52 \n\t" + "xvaddsp 37, 37, 53 \n\t" + "addic. %0 , %0 , -32 \n\t" + "xvaddsp 38, 38, 54 \n\t" + "xvaddsp 39, 39, 55 \n\t" + + "bgt 1b \n\t" + + "2: \n\t" + + + "xvabssp 48, 40 \n\t" + "xvabssp 49, 41 \n\t" + "xvabssp 50, 42 \n\t" + "xvabssp 51, 43 \n\t" + "xvabssp 52, 44 \n\t" + "xvabssp 53, 45 \n\t" + "xvabssp 54, 46 \n\t" + "xvabssp 55, 47 \n\t" + + "xvaddsp 32, 32, 48 \n\t" + "xvaddsp 33, 33, 49 \n\t" + "xvaddsp 34, 34, 50 \n\t" + "xvaddsp 35, 35, 51 \n\t" + "xvaddsp 36, 36, 52 \n\t" + "xvaddsp 37, 37, 53 \n\t" + "xvaddsp 38, 38, 54 \n\t" + "xvaddsp 39, 39, 55 \n\t" + + "xvaddsp 32, 32, 33 \n\t" + "xvaddsp 34, 34, 35 \n\t" + "xvaddsp 36, 36, 37 \n\t" + "xvaddsp 38, 38, 39 \n\t" + + "xvaddsp 32, 32, 34 \n\t" + "xvaddsp 36, 36, 38 \n\t" + + "xvaddsp 32, 32, 36 \n\t" + + + "stxvw4x 32, 0, %3 \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x1), // 2 + "r" (svec), // 3 + "r" (pre), // 4 + "r" (o16), // 5 + "r" (o32), // 6 + "r" (o48), // 7 + "r" (o64), // 8 + "r" (o80), // 9 + "r" (o96), // 10 + "r" (o112) // 11 + : "cr0", "%0", "%2", "memory" + ); + +} + + From 294f933869e887e242caaa9889d0ef452b4c2b9b Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Mon, 28 Mar 2016 13:37:32 +0200 Subject: [PATCH 32/48] added optimized zasum kernel for POWER8 --- kernel/power/KERNEL.POWER8 | 2 +- 
kernel/power/zasum.c | 149 ++++++++++++++++++++++++ kernel/power/zasum_microk_power8.c | 177 +++++++++++++++++++++++++++++ 3 files changed, 327 insertions(+), 1 deletion(-) create mode 100644 kernel/power/zasum.c create mode 100644 kernel/power/zasum_microk_power8.c diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index 495725822..1a7004fe8 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -108,7 +108,7 @@ ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c SASUMKERNEL = sasum.c DASUMKERNEL = dasum.c #CASUMKERNEL = ../arm/zasum.c -#ZASUMKERNEL = ../arm/zasum.c +ZASUMKERNEL = zasum.c # #SAXPYKERNEL = ../arm/axpy.c DAXPYKERNEL = daxpy.c diff --git a/kernel/power/zasum.c b/kernel/power/zasum.c new file mode 100644 index 000000000..abd6ec08a --- /dev/null +++ b/kernel/power/zasum.c @@ -0,0 +1,149 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/03/28 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + +#if defined(POWER8) +#include "zasum_microk_power8.c" +#endif + + +#ifndef HAVE_KERNEL_8 + +static void zasum_kernel_8(BLASLONG n, FLOAT *x1, FLOAT *svec) +{ + + BLASLONG i=0; + FLOAT *x = x1; + FLOAT temp0, temp1, temp2, temp3; + FLOAT temp4, temp5, temp6, temp7; + FLOAT sum0 = 0.0; + FLOAT sum1 = 0.0; + FLOAT sum2 = 0.0; + FLOAT sum3 = 0.0; + + while ( i< n ) + { + + temp0 = ABS(x[0]); + temp1 = ABS(x[1]); + temp2 = ABS(x[2]); + temp3 = ABS(x[3]); + temp4 = ABS(x[4]); + temp5 = ABS(x[5]); + temp6 = ABS(x[6]); + temp7 = ABS(x[7]); + + sum0 += temp0; + sum1 += temp1; + sum2 += temp2; + sum3 += temp3; + + sum0 += temp4; + sum1 += temp5; + sum2 += temp6; + sum3 += temp7; + + x+=8; + i+=4; + + } + + svec[0] = sum0+sum1+sum2+sum3; + svec[1] = 0.0; + +} + +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ip=0; + FLOAT sumf = 0.0; + FLOAT svec[2] __attribute__ ((aligned (16)));; + BLASLONG n1; + BLASLONG inc_x2; + + if (n <= 0 || 
inc_x <= 0) return(sumf); + + if ( inc_x == 1 ) + { + + n1 = n & -8; + if ( n1 > 0 ) + { + + zasum_kernel_8(n1, x, svec); + sumf = svec[0] + svec[1]; + i=n1; + ip=2*n1; + } + + while(i < n) + { + sumf += ABS(x[ip]) + ABS(x[ip+1]); + i++; + ip+=2; + } + + } + else + { + inc_x2 = 2* inc_x; + + while(i < n) + { + sumf += ABS(x[ip]) + ABS(x[ip+1]); + ip+=inc_x2; + i++; + } + + } + return(sumf); +} + + diff --git a/kernel/power/zasum_microk_power8.c b/kernel/power/zasum_microk_power8.c new file mode 100644 index 000000000..b9f6c0ac6 --- /dev/null +++ b/kernel/power/zasum_microk_power8.c @@ -0,0 +1,177 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/03/28 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#define HAVE_KERNEL_8 1 +static void zasum_kernel_8( BLASLONG n, FLOAT *x, FLOAT *svec) __attribute__ ((noinline)); + +static void zasum_kernel_8( BLASLONG n, FLOAT *x, FLOAT *svec) +{ + + + BLASLONG i = n; + BLASLONG o16 = 16; + BLASLONG o32 = 32; + BLASLONG o48 = 48; + BLASLONG o64 = 64; + BLASLONG o80 = 80; + BLASLONG o96 = 96; + BLASLONG o112 = 112; + FLOAT *x1=x; + BLASLONG pre = 384; + + __asm__ __volatile__ + ( + + "dcbt %2 , %4 \n\t" + + "xxlxor 32,32,32 \n\t" + "xxlxor 33,33,33 \n\t" + "xxlxor 34,34,34 \n\t" + "xxlxor 35,35,35 \n\t" + "xxlxor 36,36,36 \n\t" + "xxlxor 37,37,37 \n\t" + "xxlxor 38,38,38 \n\t" + "xxlxor 39,39,39 \n\t" + + "lxvd2x 40, 0, %2 \n\t" + "lxvd2x 41, %5, %2 \n\t" + "lxvd2x 42, %6, %2 \n\t" + "lxvd2x 43, %7, %2 \n\t" + "lxvd2x 44, %8, %2 \n\t" + "lxvd2x 45, %9, %2 \n\t" + "lxvd2x 46, %10, %2 \n\t" + "lxvd2x 47, %11, %2 \n\t" + + "addi %2, %2, 128 \n\t" + + "addic. 
%0 , %0 , -8 \n\t" + "ble 2f \n\t" + + ".align 5 \n\t" + "1: \n\t" + + "dcbt %2 , %4 \n\t" + + "xvabsdp 48, 40 \n\t" + "xvabsdp 49, 41 \n\t" + "xvabsdp 50, 42 \n\t" + "xvabsdp 51, 43 \n\t" + + "lxvd2x 40, 0, %2 \n\t" + "lxvd2x 41, %5, %2 \n\t" + + "xvabsdp 52, 44 \n\t" + "xvabsdp 53, 45 \n\t" + + "lxvd2x 42, %6, %2 \n\t" + "lxvd2x 43, %7, %2 \n\t" + + "xvabsdp 54, 46 \n\t" + "xvabsdp 55, 47 \n\t" + + "lxvd2x 44, %8, %2 \n\t" + "lxvd2x 45, %9, %2 \n\t" + + "xvadddp 32, 32, 48 \n\t" + "xvadddp 33, 33, 49 \n\t" + + "lxvd2x 46, %10, %2 \n\t" + "lxvd2x 47, %11, %2 \n\t" + + "xvadddp 34, 34, 50 \n\t" + "xvadddp 35, 35, 51 \n\t" + "addi %2, %2, 128 \n\t" + "xvadddp 36, 36, 52 \n\t" + "xvadddp 37, 37, 53 \n\t" + "addic. %0 , %0 , -8 \n\t" + "xvadddp 38, 38, 54 \n\t" + "xvadddp 39, 39, 55 \n\t" + + "bgt 1b \n\t" + + "2: \n\t" + + + "xvabsdp 48, 40 \n\t" + "xvabsdp 49, 41 \n\t" + "xvabsdp 50, 42 \n\t" + "xvabsdp 51, 43 \n\t" + "xvabsdp 52, 44 \n\t" + "xvabsdp 53, 45 \n\t" + "xvabsdp 54, 46 \n\t" + "xvabsdp 55, 47 \n\t" + + "xvadddp 32, 32, 48 \n\t" + "xvadddp 33, 33, 49 \n\t" + "xvadddp 34, 34, 50 \n\t" + "xvadddp 35, 35, 51 \n\t" + "xvadddp 36, 36, 52 \n\t" + "xvadddp 37, 37, 53 \n\t" + "xvadddp 38, 38, 54 \n\t" + "xvadddp 39, 39, 55 \n\t" + + "xvadddp 32, 32, 33 \n\t" + "xvadddp 34, 34, 35 \n\t" + "xvadddp 36, 36, 37 \n\t" + "xvadddp 38, 38, 39 \n\t" + + "xvadddp 32, 32, 34 \n\t" + "xvadddp 36, 36, 38 \n\t" + + "xvadddp 32, 32, 36 \n\t" + + + "stxvd2x 32, 0, %3 \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x1), // 2 + "r" (svec), // 3 + "r" (pre), // 4 + "r" (o16), // 5 + "r" (o32), // 6 + "r" (o48), // 7 + "r" (o64), // 8 + "r" (o80), // 9 + "r" (o96), // 10 + "r" (o112) // 11 + : "cr0", "%0", "%2", "memory" + ); + +} + + From c2464a7c4a1e36e654bc21aefdec88b56e36ecd5 Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Mon, 28 Mar 2016 14:12:08 +0200 Subject: [PATCH 33/48] added optimized casum kernel for POWER8 --- kernel/power/KERNEL.POWER8 | 2 +- 
kernel/power/casum.c | 151 ++++++++++++++++++++++++ kernel/power/casum_microk_power8.c | 177 +++++++++++++++++++++++++++++ 3 files changed, 329 insertions(+), 1 deletion(-) create mode 100644 kernel/power/casum.c create mode 100644 kernel/power/casum_microk_power8.c diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index 1a7004fe8..890842e05 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -107,7 +107,7 @@ ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c # SASUMKERNEL = sasum.c DASUMKERNEL = dasum.c -#CASUMKERNEL = ../arm/zasum.c +CASUMKERNEL = casum.c ZASUMKERNEL = zasum.c # #SAXPYKERNEL = ../arm/axpy.c diff --git a/kernel/power/casum.c b/kernel/power/casum.c new file mode 100644 index 000000000..aeed0ca78 --- /dev/null +++ b/kernel/power/casum.c @@ -0,0 +1,151 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/03/28 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + +#if defined(POWER8) +#include "casum_microk_power8.c" +#endif + + +#ifndef HAVE_KERNEL_16 + +static void casum_kernel_16(BLASLONG n, FLOAT *x1, FLOAT *svec) +{ + + BLASLONG i=0; + FLOAT *x = x1; + FLOAT temp0, temp1, temp2, temp3; + FLOAT temp4, temp5, temp6, temp7; + FLOAT sum0 = 0.0; + FLOAT sum1 = 0.0; + FLOAT sum2 = 0.0; + FLOAT sum3 = 0.0; + + while ( i< n ) + { + + temp0 = ABS(x[0]); + temp1 = ABS(x[1]); + temp2 = ABS(x[2]); + temp3 = ABS(x[3]); + temp4 = ABS(x[4]); + temp5 = ABS(x[5]); + temp6 = ABS(x[6]); + temp7 = ABS(x[7]); + + sum0 += temp0; + sum1 += temp1; + sum2 += temp2; + sum3 += temp3; + + sum0 += temp4; + sum1 += temp5; + sum2 += temp6; + sum3 += temp7; + + x+=8; + i+=4; + + } + + svec[0] = sum0+sum1+sum2+sum3; + svec[1] = 0.0; + svec[2] = 0.0; + svec[3] = 0.0; + +} + +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ip=0; + FLOAT sumf = 0.0; + FLOAT svec[4] __attribute__ ((aligned (16)));; + BLASLONG n1; + 
BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return(sumf); + + if ( inc_x == 1 ) + { + + n1 = n & -16; + if ( n1 > 0 ) + { + + casum_kernel_16(n1, x, svec); + sumf = svec[0] + svec[1]+svec[2]+svec[3]; + i=n1; + ip = 2 * n1; + } + + while(i < n) + { + sumf += ABS(x[ip]) + ABS(x[ip+1]); + ip += 2; + i++; + } + + } + else + { + inc_x2 = 2 * inc_x; + + while(i < n) + { + sumf += ABS(x[ip]) + ABS(x[ip+1]); + ip += inc_x2; + i++; + } + + } + return(sumf); +} + + diff --git a/kernel/power/casum_microk_power8.c b/kernel/power/casum_microk_power8.c new file mode 100644 index 000000000..cb50234ce --- /dev/null +++ b/kernel/power/casum_microk_power8.c @@ -0,0 +1,177 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/03/28 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#define HAVE_KERNEL_16 1 +static void casum_kernel_16( BLASLONG n, FLOAT *x, FLOAT *svec) __attribute__ ((noinline)); + +static void casum_kernel_16( BLASLONG n, FLOAT *x, FLOAT *svec) +{ + + + BLASLONG i = n; + BLASLONG o16 = 16; + BLASLONG o32 = 32; + BLASLONG o48 = 48; + BLASLONG o64 = 64; + BLASLONG o80 = 80; + BLASLONG o96 = 96; + BLASLONG o112 = 112; + FLOAT *x1=x; + BLASLONG pre = 384; + + __asm__ __volatile__ + ( + + "dcbt %2 , %4 \n\t" + + "xxlxor 32,32,32 \n\t" + "xxlxor 33,33,33 \n\t" + "xxlxor 34,34,34 \n\t" + "xxlxor 35,35,35 \n\t" + "xxlxor 36,36,36 \n\t" + "xxlxor 37,37,37 \n\t" + "xxlxor 38,38,38 \n\t" + "xxlxor 39,39,39 \n\t" + + "lxvw4x 40, 0, %2 \n\t" + "lxvw4x 41, %5, %2 \n\t" + "lxvw4x 42, %6, %2 \n\t" + "lxvw4x 43, %7, %2 \n\t" + "lxvw4x 44, %8, %2 \n\t" + "lxvw4x 45, %9, %2 \n\t" + "lxvw4x 46, %10, %2 \n\t" + "lxvw4x 47, %11, %2 \n\t" + + "addi %2, %2, 128 \n\t" + + "addic. 
%0 , %0 , -16 \n\t" + "ble 2f \n\t" + + ".align 5 \n\t" + "1: \n\t" + + "dcbt %2 , %4 \n\t" + + "xvabssp 48, 40 \n\t" + "xvabssp 49, 41 \n\t" + "xvabssp 50, 42 \n\t" + "xvabssp 51, 43 \n\t" + + "lxvw4x 40, 0, %2 \n\t" + "lxvw4x 41, %5, %2 \n\t" + + "xvabssp 52, 44 \n\t" + "xvabssp 53, 45 \n\t" + + "lxvw4x 42, %6, %2 \n\t" + "lxvw4x 43, %7, %2 \n\t" + + "xvabssp 54, 46 \n\t" + "xvabssp 55, 47 \n\t" + + "lxvw4x 44, %8, %2 \n\t" + "lxvw4x 45, %9, %2 \n\t" + + "xvaddsp 32, 32, 48 \n\t" + "xvaddsp 33, 33, 49 \n\t" + + "lxvw4x 46, %10, %2 \n\t" + "lxvw4x 47, %11, %2 \n\t" + + "xvaddsp 34, 34, 50 \n\t" + "xvaddsp 35, 35, 51 \n\t" + "addi %2, %2, 128 \n\t" + "xvaddsp 36, 36, 52 \n\t" + "xvaddsp 37, 37, 53 \n\t" + "addic. %0 , %0 , -16 \n\t" + "xvaddsp 38, 38, 54 \n\t" + "xvaddsp 39, 39, 55 \n\t" + + "bgt 1b \n\t" + + "2: \n\t" + + + "xvabssp 48, 40 \n\t" + "xvabssp 49, 41 \n\t" + "xvabssp 50, 42 \n\t" + "xvabssp 51, 43 \n\t" + "xvabssp 52, 44 \n\t" + "xvabssp 53, 45 \n\t" + "xvabssp 54, 46 \n\t" + "xvabssp 55, 47 \n\t" + + "xvaddsp 32, 32, 48 \n\t" + "xvaddsp 33, 33, 49 \n\t" + "xvaddsp 34, 34, 50 \n\t" + "xvaddsp 35, 35, 51 \n\t" + "xvaddsp 36, 36, 52 \n\t" + "xvaddsp 37, 37, 53 \n\t" + "xvaddsp 38, 38, 54 \n\t" + "xvaddsp 39, 39, 55 \n\t" + + "xvaddsp 32, 32, 33 \n\t" + "xvaddsp 34, 34, 35 \n\t" + "xvaddsp 36, 36, 37 \n\t" + "xvaddsp 38, 38, 39 \n\t" + + "xvaddsp 32, 32, 34 \n\t" + "xvaddsp 36, 36, 38 \n\t" + + "xvaddsp 32, 32, 36 \n\t" + + + "stxvw4x 32, 0, %3 \n\t" + + : + : + "r" (i), // 0 + "r" (n), // 1 + "r" (x1), // 2 + "r" (svec), // 3 + "r" (pre), // 4 + "r" (o16), // 5 + "r" (o32), // 6 + "r" (o48), // 7 + "r" (o64), // 8 + "r" (o80), // 9 + "r" (o96), // 10 + "r" (o112) // 11 + : "cr0", "%0", "%2", "memory" + ); + +} + + From 68a69c5b50628fa3d2cb8180b591847a018bd33a Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Wed, 30 Mar 2016 11:10:53 +0200 Subject: [PATCH 34/48] added optimized dgemv_n kernel for POWER8 --- kernel/power/KERNEL.POWER8 | 2 +- 
kernel/power/dgemv_n.c | 426 +++++++++++++++++++++++++++ kernel/power/dgemv_n_microk_power8.c | 301 +++++++++++++++++++ 3 files changed, 728 insertions(+), 1 deletion(-) create mode 100644 kernel/power/dgemv_n.c create mode 100644 kernel/power/dgemv_n_microk_power8.c diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index 890842e05..b37a4213b 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -147,7 +147,7 @@ ZSWAPKERNEL = zswap.c # #SGEMVNKERNEL = ../arm/gemv_n.c -#DGEMVNKERNEL = ../arm/gemv_n.c +DGEMVNKERNEL = dgemv_n.c #CGEMVNKERNEL = ../arm/zgemv_n.c #ZGEMVNKERNEL = ../arm/zgemv_n.c # diff --git a/kernel/power/dgemv_n.c b/kernel/power/dgemv_n.c new file mode 100644 index 000000000..812d09d15 --- /dev/null +++ b/kernel/power/dgemv_n.c @@ -0,0 +1,426 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/03/30 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + + + +#include "common.h" + + +#if defined(POWER8) +#include "dgemv_n_microk_power8.c" +#endif + + +#define NBMAX 4096 + +#ifndef HAVE_KERNEL_4x4 + +static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) +{ + BLASLONG i; + FLOAT *a0,*a1,*a2,*a3; + FLOAT x[4] __attribute__ ((aligned (16)));; + a0 = ap[0]; + a1 = ap[1]; + a2 = ap[2]; + a3 = ap[3]; + + for ( i=0; i<4; i++) + x[i] = xo[i] * *alpha; + + for ( i=0; i< n; i+=4 ) + { + y[i] += a0[i]*x[0] + a1[i]*x[1] + a2[i]*x[2] + a3[i]*x[3]; + y[i+1] += a0[i+1]*x[0] + a1[i+1]*x[1] + a2[i+1]*x[2] + a3[i+1]*x[3]; + y[i+2] += a0[i+2]*x[0] + a1[i+2]*x[1] + a2[i+2]*x[2] + a3[i+2]*x[3]; + y[i+3] += a0[i+3]*x[0] + a1[i+3]*x[1] + a2[i+3]*x[2] + a3[i+3]*x[3]; + } +} + +#endif + +#ifndef HAVE_KERNEL_4x2 + +static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) +{ + BLASLONG i; + FLOAT *a0,*a1; + FLOAT x[4] __attribute__ ((aligned (16)));; + a0 = ap[0]; + a1 = ap[1]; + + for ( i=0; i<2; i++) + x[i] = xo[i] * *alpha; + + for ( i=0; i< n; i+=4 ) + { + y[i] += a0[i]*x[0] + a1[i]*x[1]; + 
y[i+1] += a0[i+1]*x[0] + a1[i+1]*x[1]; + y[i+2] += a0[i+2]*x[0] + a1[i+2]*x[1]; + y[i+3] += a0[i+3]*x[0] + a1[i+3]*x[1]; + } +} + + +#endif + +#ifndef HAVE_KERNEL_4x1 + +static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) +{ + BLASLONG i; + FLOAT *a0; + FLOAT x[4] __attribute__ ((aligned (16)));; + a0 = ap; + + for ( i=0; i<1; i++) + x[i] = xo[i] * *alpha; + + for ( i=0; i< n; i+=4 ) + { + y[i] += a0[i]*x[0]; + y[i+1] += a0[i+1]*x[0]; + y[i+2] += a0[i+2]*x[0]; + y[i+3] += a0[i+3]*x[0]; + } +} + + +#endif + + +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) +{ + BLASLONG i; + if ( inc_dest != 1 ) + { + for ( i=0; i> 2 ; + n2 = n & 3 ; + + m3 = m & 3 ; + m1 = m & -4 ; + m2 = (m & (NBMAX-1)) - m3 ; + + y_ptr = y; + + BLASLONG NB = NBMAX; + + while ( NB == NBMAX ) + { + + m1 -= NB; + if ( m1 < 0) + { + if ( m2 == 0 ) break; + NB = m2; + } + + a_ptr = a; + x_ptr = x; + + ap[0] = a_ptr; + ap[1] = a_ptr + lda; + ap[2] = ap[1] + lda; + ap[3] = ap[2] + lda; + + if ( inc_y != 1 ) + memset(ybuffer,0,NB*8); + else + ybuffer = y_ptr; + + if ( inc_x == 1 ) + { + + + for( i = 0; i < n1 ; i++) + { + dgemv_kernel_4x4(NB,ap,x_ptr,ybuffer,alpha_r); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + x_ptr += 4; + } + + if ( n2 & 2 ) + { + dgemv_kernel_4x2(NB,ap,x_ptr,ybuffer,alpha_r); + a_ptr += lda*2; + x_ptr += 2; + } + + + if ( n2 & 1 ) + { + dgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,alpha_r); + a_ptr += lda; + x_ptr += 1; + + } + + + } + else + { + + for( i = 0; i < n1 ; i++) + { + xbuffer[0] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[1] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[2] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[3] = x_ptr[0]; + x_ptr += inc_x; + dgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,alpha_r); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + } + + for( i = 0; i < n2 ; i++) + { + xbuffer[0] = x_ptr[0]; + x_ptr += inc_x; + 
dgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,alpha_r); + a_ptr += lda; + + } + + } + + a += NB; + if ( inc_y != 1 ) + { + add_y(NB,ybuffer,y_ptr,inc_y); + y_ptr += NB * inc_y; + } + else + y_ptr += NB ; + + } + + if ( m3 == 0 ) return(0); + + if ( m3 == 3 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + FLOAT temp2 = 0.0; + if ( lda == 3 && inc_x ==1 ) + { + + for( i = 0; i < ( n & -4 ); i+=4 ) + { + + temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1]; + temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1]; + + temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9] * x_ptr[3]; + temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3]; + temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3]; + + a_ptr += 12; + x_ptr += 4; + } + + for( ; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + temp2 += a_ptr[2] * x_ptr[0]; + a_ptr += 3; + x_ptr ++; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + temp2 += a_ptr[2] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + + + } + + } + y_ptr[0] += alpha * temp0; + y_ptr += inc_y; + y_ptr[0] += alpha * temp1; + y_ptr += inc_y; + y_ptr[0] += alpha * temp2; + return(0); + } + + + if ( m3 == 2 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + if ( lda == 2 && inc_x ==1 ) + { + + for( i = 0; i < (n & -4) ; i+=4 ) + { + temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1]; + temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3]; + temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3]; + a_ptr += 8; + x_ptr += 4; + + } + + + for( ; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + a_ptr += 2; + x_ptr ++; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + + + } + + } + y_ptr[0] += 
alpha * temp0; + y_ptr += inc_y; + y_ptr[0] += alpha * temp1; + return(0); + } + + if ( m3 == 1 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp = 0.0; + if ( lda == 1 && inc_x ==1 ) + { + + for( i = 0; i < (n & -4); i+=4 ) + { + temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3]; + + } + + for( ; i < n; i++ ) + { + temp += a_ptr[i] * x_ptr[i]; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp += a_ptr[0] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + } + + } + y_ptr[0] += alpha * temp; + return(0); + } + + + return(0); +} + + diff --git a/kernel/power/dgemv_n_microk_power8.c b/kernel/power/dgemv_n_microk_power8.c new file mode 100644 index 000000000..9eabe555c --- /dev/null +++ b/kernel/power/dgemv_n_microk_power8.c @@ -0,0 +1,301 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/03/30 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#define HAVE_KERNEL_4x4 1 + +static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); + +static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) +{ + BLASLONG i=n; + BLASLONG o8 = 8; + BLASLONG o16 = 16; + BLASLONG o24 = 24; + BLASLONG pre = 384; + + FLOAT *a0,*a1,*a2,*a3; + FLOAT *y1=y+1; + FLOAT x[4] __attribute__ ((aligned (16)));; + a0 = ap[0]+1; + a1 = ap[1]+1; + a2 = ap[2]+1; + a3 = ap[3]+1; + + x[0]=xo[0] * *alpha; + x[1]=xo[1] * *alpha; + x[2]=xo[2] * *alpha; + x[3]=xo[3] * *alpha; + + + __asm__ __volatile__ + ( + "lxvdsx 32, 0 , %1 \n\t" // x0 + "lxvdsx 33,%3 , %1 \n\t" // x1 + "lxvdsx 34,%4 , %1 \n\t" // x2 + "lxvdsx 35,%5 , %1 \n\t" // x3 + "addi %2 , %2 , -8 \n\t" + "addi %6 , %6 , -8 \n\t" + "addi %7 , %7 , -8 \n\t" + "addi %8 , %8 , -8 \n\t" + "addi %9 , %9 , -8 \n\t" + + "lxvd2x 48, 0, %6 \n\t" // a0[0], a0[1] + "lxvd2x 49,%4, %6 \n\t" // a0[2], a0[3] + + "lxvd2x 50, 0, %7 \n\t" // a1[0], a1[1] + "lxvd2x 51,%4, %7 \n\t" // a1[2], a1[3] + + "lxvd2x 52, 0, %8 \n\t" // a2[0], a2[1] 
+ "lxvd2x 53,%4, %8 \n\t" // a2[2], a2[3] + + "lxvd2x 54, 0, %9 \n\t" // a3[0], a3[1] + "lxvd2x 55,%4, %9 \n\t" // a3[2], a3[3] + + "addi %6, %6, 32 \n\t" + "addi %7, %7, 32 \n\t" + "addi %8, %8, 32 \n\t" + "addi %9, %9, 32 \n\t" + + "addic. %0 , %0 , -4 \n\t" + "ble 2f \n\t" + + ".align 5 \n\t" + "1: \n\t" + + "dcbt %2, %10 \n\t" + + "lxvd2x 40, 0, %2 \n\t" // y0, y1 + "lxvd2x 41,%4, %2 \n\t" // y2, y3 + + "dcbt %6, %10 \n\t" + "dcbt %7, %10 \n\t" + "dcbt %8, %10 \n\t" + "dcbt %9, %10 \n\t" + + "xvmaddadp 40, 48, 32 \n\t" + "xvmaddadp 41, 49, 32 \n\t" + + "lxvd2x 48, 0, %6 \n\t" // a0[0], a0[1] + "lxvd2x 49,%4, %6 \n\t" // a0[2], a0[3] + + "xvmaddadp 40, 50, 33 \n\t" + "addi %6, %6, 32 \n\t" + "xvmaddadp 41, 51, 33 \n\t" + + "lxvd2x 50, 0, %7 \n\t" // a1[0], a1[1] + "lxvd2x 51,%4, %7 \n\t" // a1[2], a1[3] + + "xvmaddadp 40, 52, 34 \n\t" + "addi %7, %7, 32 \n\t" + "xvmaddadp 41, 53, 34 \n\t" + + "lxvd2x 52, 0, %8 \n\t" // a2[0], a2[1] + "lxvd2x 53,%4, %8 \n\t" // a2[2], a2[3] + + "xvmaddadp 40, 54, 35 \n\t" + "addi %8, %8, 32 \n\t" + "xvmaddadp 41, 55, 35 \n\t" + + "stxvd2x 40, 0, %2 \n\t" // y0, y1 + "stxvd2x 41,%4, %2 \n\t" // y2, y3 + + "lxvd2x 54, 0, %9 \n\t" // a3[0], a3[1] + "lxvd2x 55,%4, %9 \n\t" // a3[2], a3[3] + + "addi %9, %9, 32 \n\t" + "addi %2, %2, 32 \n\t" + + "addic. 
%0 , %0 , -4 \n\t" + "ble 2f \n\t" + + + "lxvd2x 40, 0, %2 \n\t" // y0, y1 + "lxvd2x 41,%4, %2 \n\t" // y2, y3 + + "xvmaddadp 40, 48, 32 \n\t" + "xvmaddadp 41, 49, 32 \n\t" + + "lxvd2x 48, 0, %6 \n\t" // a0[0], a0[1] + "lxvd2x 49,%4, %6 \n\t" // a0[2], a0[3] + + "xvmaddadp 40, 50, 33 \n\t" + "addi %6, %6, 32 \n\t" + "xvmaddadp 41, 51, 33 \n\t" + + "lxvd2x 50, 0, %7 \n\t" // a1[0], a1[1] + "lxvd2x 51,%4, %7 \n\t" // a1[2], a1[3] + + "xvmaddadp 40, 52, 34 \n\t" + "addi %7, %7, 32 \n\t" + "xvmaddadp 41, 53, 34 \n\t" + + "lxvd2x 52, 0, %8 \n\t" // a2[0], a2[1] + "lxvd2x 53,%4, %8 \n\t" // a2[2], a2[3] + + "xvmaddadp 40, 54, 35 \n\t" + "addi %8, %8, 32 \n\t" + "xvmaddadp 41, 55, 35 \n\t" + + "stxvd2x 40, 0, %2 \n\t" // y0, y1 + "stxvd2x 41,%4, %2 \n\t" // y2, y3 + + "lxvd2x 54, 0, %9 \n\t" // a3[0], a3[1] + "lxvd2x 55,%4, %9 \n\t" // a3[2], a3[3] + + "addi %9, %9, 32 \n\t" + "addi %2, %2, 32 \n\t" + + "addic. %0 , %0 , -4 \n\t" + "ble 2f \n\t" + + + "lxvd2x 40, 0, %2 \n\t" // y0, y1 + "lxvd2x 41,%4, %2 \n\t" // y2, y3 + + "xvmaddadp 40, 48, 32 \n\t" + "xvmaddadp 41, 49, 32 \n\t" + + "lxvd2x 48, 0, %6 \n\t" // a0[0], a0[1] + "lxvd2x 49,%4, %6 \n\t" // a0[2], a0[3] + + "xvmaddadp 40, 50, 33 \n\t" + "addi %6, %6, 32 \n\t" + "xvmaddadp 41, 51, 33 \n\t" + + "lxvd2x 50, 0, %7 \n\t" // a1[0], a1[1] + "lxvd2x 51,%4, %7 \n\t" // a1[2], a1[3] + + "xvmaddadp 40, 52, 34 \n\t" + "addi %7, %7, 32 \n\t" + "xvmaddadp 41, 53, 34 \n\t" + + "lxvd2x 52, 0, %8 \n\t" // a2[0], a2[1] + "lxvd2x 53,%4, %8 \n\t" // a2[2], a2[3] + + "xvmaddadp 40, 54, 35 \n\t" + "addi %8, %8, 32 \n\t" + "xvmaddadp 41, 55, 35 \n\t" + + "stxvd2x 40, 0, %2 \n\t" // y0, y1 + "stxvd2x 41,%4, %2 \n\t" // y2, y3 + + "lxvd2x 54, 0, %9 \n\t" // a3[0], a3[1] + "lxvd2x 55,%4, %9 \n\t" // a3[2], a3[3] + + "addi %9, %9, 32 \n\t" + "addi %2, %2, 32 \n\t" + + "addic. 
%0 , %0 , -4 \n\t" + "ble 2f \n\t" + + + "lxvd2x 40, 0, %2 \n\t" // y0, y1 + "lxvd2x 41,%4, %2 \n\t" // y2, y3 + + "xvmaddadp 40, 48, 32 \n\t" + "xvmaddadp 41, 49, 32 \n\t" + + "lxvd2x 48, 0, %6 \n\t" // a0[0], a0[1] + "lxvd2x 49,%4, %6 \n\t" // a0[2], a0[3] + + "xvmaddadp 40, 50, 33 \n\t" + "addi %6, %6, 32 \n\t" + "xvmaddadp 41, 51, 33 \n\t" + + "lxvd2x 50, 0, %7 \n\t" // a1[0], a1[1] + "lxvd2x 51,%4, %7 \n\t" // a1[2], a1[3] + + "xvmaddadp 40, 52, 34 \n\t" + "addi %7, %7, 32 \n\t" + "xvmaddadp 41, 53, 34 \n\t" + + "lxvd2x 52, 0, %8 \n\t" // a2[0], a2[1] + "lxvd2x 53,%4, %8 \n\t" // a2[2], a2[3] + + "xvmaddadp 40, 54, 35 \n\t" + "addi %8, %8, 32 \n\t" + "xvmaddadp 41, 55, 35 \n\t" + + "stxvd2x 40, 0, %2 \n\t" // y0, y1 + "stxvd2x 41,%4, %2 \n\t" // y2, y3 + + "lxvd2x 54, 0, %9 \n\t" // a3[0], a3[1] + "lxvd2x 55,%4, %9 \n\t" // a3[2], a3[3] + + "addi %9, %9, 32 \n\t" + "addi %2, %2, 32 \n\t" + + "addic. %0 , %0 , -4 \n\t" + "bgt 1b \n\t" + + "2: \n\t" + + "lxvd2x 40, 0, %2 \n\t" // y0, y1 + "lxvd2x 41,%4, %2 \n\t" // y2, y3 + + "xvmaddadp 40, 48, 32 \n\t" + "xvmaddadp 41, 49, 32 \n\t" + + "xvmaddadp 40, 50, 33 \n\t" + "xvmaddadp 41, 51, 33 \n\t" + + "xvmaddadp 40, 52, 34 \n\t" + "xvmaddadp 41, 53, 34 \n\t" + + "xvmaddadp 40, 54, 35 \n\t" + "xvmaddadp 41, 55, 35 \n\t" + + "stxvd2x 40, 0, %2 \n\t" // y0, y1 + "stxvd2x 41,%4, %2 \n\t" // y2, y3 + + : + : + "r" (i), // 0 + "r" (x), // 1 + "r" (y1), // 2 + "r" (o8), // 3 + "r" (o16), // 4 + "r" (o24), // 5 + "r" (a0), // 6 + "r" (a1), // 7 + "r" (a2), // 8 + "r" (a3), // 9 + "r" (pre) // 10 + : "cr0", "%0", "%2" , "%6", "%7", "%8", "%9", "memory" + ); + +} + + From 7aac0aff8ec22598734f5effa2fcc107c95c0655 Mon Sep 17 00:00:00 2001 From: Jerome Robert Date: Thu, 31 Mar 2016 23:03:52 +0200 Subject: [PATCH 35/48] Allow to force to do not use -j as make argument Close #828 (hopefully) --- Makefile.rule | 5 ++++- getarch.c | 5 +++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/Makefile.rule b/Makefile.rule 
index 847ee1ca8..6d0d8f4e5 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -112,7 +112,10 @@ NO_AFFINITY = 1 # NO_PARALLEL_MAKE = 1 # Force number of make jobs. The default is the number of logical CPU of the host. -# This is particularly useful when using distcc +# This is particularly useful when using distcc. +# A negative value will disable adding a -j flag to make, allowing to use a parent +# make -j value. This is usefull to call OpenBLAS make from an other project +# makefile # MAKE_NB_JOBS = 2 # If you would like to know minute performance report of GotoBLAS. diff --git a/getarch.c b/getarch.c index f9c49e663..1e0b08675 100644 --- a/getarch.c +++ b/getarch.c @@ -1013,7 +1013,12 @@ int main(int argc, char *argv[]){ #endif #ifdef MAKE_NB_JOBS + #if MAKE_NB_JOBS > 0 printf("MAKE += -j %d\n", MAKE_NB_JOBS); + #else + // Let make use parent -j argument or -j1 if there + // is no make parent + #endif #elif NO_PARALLEL_MAKE==1 printf("MAKE += -j 1\n"); #else From dd7612358dd68bd0db90ea9b334c2a3dbc7c62c4 Mon Sep 17 00:00:00 2001 From: Erik Schnetter Date: Fri, 1 Apr 2016 13:49:33 -0400 Subject: [PATCH 36/48] Correct small typo in comment --- Makefile.rule | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.rule b/Makefile.rule index 6d0d8f4e5..27aa5a539 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -114,7 +114,7 @@ NO_AFFINITY = 1 # Force number of make jobs. The default is the number of logical CPU of the host. # This is particularly useful when using distcc. # A negative value will disable adding a -j flag to make, allowing to use a parent -# make -j value. This is usefull to call OpenBLAS make from an other project +# make -j value. 
This is useful to call OpenBLAS make from an other project # makefile # MAKE_NB_JOBS = 2 From 6a9bbfc227314547024a4931f16295a7101849c0 Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Sat, 2 Apr 2016 17:16:36 +0200 Subject: [PATCH 37/48] updated sgemm- and strmm-kernel for POWER8 --- kernel/power/sgemm_kernel_16x8_power8.S | 30 +- kernel/power/sgemm_logic_16x8_power8.S | 1085 +++-- kernel/power/sgemm_macros_16x8_power8.S | 4967 +++++-------------- kernel/power/strmm_kernel_16x8_power8.S | 24 +- kernel/power/strmm_logic_16x8_power8.S | 895 ++-- kernel/power/strmm_macros_16x8_power8.S | 5840 +++++++++++++++++++++++ param.h | 8 +- 7 files changed, 8217 insertions(+), 4632 deletions(-) create mode 100644 kernel/power/strmm_macros_16x8_power8.S diff --git a/kernel/power/sgemm_kernel_16x8_power8.S b/kernel/power/sgemm_kernel_16x8_power8.S index 031f342ad..c2dc1f651 100644 --- a/kernel/power/sgemm_kernel_16x8_power8.S +++ b/kernel/power/sgemm_kernel_16x8_power8.S @@ -26,11 +26,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** -* 2016/03/18 Werner Saar (wernsaar@googlemail.com) +* 2016/04/02 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK -* LAPACK-TEST : OK +* LAPACK-TEST : OK **************************************************************************************/ /*********************************************************************/ @@ -128,17 +128,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif #define alpha_r vs30 +#define alpha_vr vs31 #define o0 0 -#define TBUFFER r14 +#define BBUFFER r14 #define o4 r15 #define o12 r16 #define o8 r17 #define L r18 #define T1 r19 #define KK r20 -#define BB r21 +#define BBO r21 #define I r22 #define J r23 #define AO r24 @@ -256,11 +257,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. cmpwi cr0, M, 0 - ble .L999_H1 + ble L999_H1 cmpwi cr0, N, 0 - ble .L999_H1 + ble L999_H1 cmpwi cr0, K, 0 - ble .L999_H1 + ble L999_H1 li PRE, 256 li o4 , 4 @@ -269,18 +270,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. li o16, 16 li o32, 32 li o48, 48 - addi TBUFFER, SP, 320 + + li T1, 256 + slwi T1, T1, 9 // 131072 + sub BBUFFER, A, T1 // temp buffer for B unrolled addi T1, SP, 300 - stfs f1, 0(T1) + stxsspx f1, o0 , T1 + stxsspx f1, o4 , T1 + stxsspx f1, o8 , T1 + stxsspx f1, o12 , T1 - lxsspx alpha_r, 0, T1 + lxsspx alpha_r, o0, T1 + lxvw4x alpha_vr, o0, T1 #include "sgemm_logic_16x8_power8.S" -.L999: +L999: addi r3, 0, 0 lfd f14, 0(SP) diff --git a/kernel/power/sgemm_logic_16x8_power8.S b/kernel/power/sgemm_logic_16x8_power8.S index 0ae6413ce..06bb79ea3 100644 --- a/kernel/power/sgemm_logic_16x8_power8.S +++ b/kernel/power/sgemm_logic_16x8_power8.S @@ -26,94 +26,149 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** -* 2016/03/18 Werner Saar (wernsaar@googlemail.com) +* 2016/04/02 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK -* LAPACK-TEST : OK +* LAPACK-TEST : OK **************************************************************************************/ srawi. 
J, N, 3 - ble .LSGEMM_L8_END + ble SGEMM_L8_END -.LSGEMM_L8_BEGIN: +SGEMM_L8_BEGIN: + + mr BO, B + mr BBO, BBUFFER + slwi T1, K, 3 + +SGEMM_L8_COPYB: + dcbtst BBO, PRE + + lxvw4x vs3, o0, BO + lxvw4x vs11, o16, BO + xxspltw vs4, vs3, 0 + xxspltw vs5, vs3, 1 + xxspltw vs6, vs3, 2 + xxspltw vs7, vs3, 3 + xxspltw vs12, vs11, 0 + xxspltw vs13, vs11, 1 + xxspltw vs14, vs11, 2 + xxspltw vs15, vs11, 3 + stxvw4x vs4, o0, BBO + stxvw4x vs5, o16, BBO + stxvw4x vs6, o32, BBO + stxvw4x vs7, o48, BBO + addi BO, BO, 32 + addi BBO, BBO, 64 + stxvw4x vs12, o0, BBO + stxvw4x vs13, o16, BBO + stxvw4x vs14, o32, BBO + stxvw4x vs15, o48, BBO + addic. T1, T1, -8 + addi BBO, BBO, 64 + + bge SGEMM_L8_COPYB mr CO, C mr AO, A slwi T1, LDC , 3 add C, C, T1 srawi. I, M, 4 - ble .LSGEMM_L8x16_END + ble SGEMM_L8x16_END -.LSGEMM_L8x16_BEGIN: +SGEMM_L8x16_BEGIN: - mr BO, B + mr BO, BBUFFER srawi. L, K, 3 - ble .LSGEMM_L8x16_SUB0 + ble SGEMM_L8x16_SUB0 cmpwi cr0, L, 1 - ble .LSGEMM_L8x16_SUB4 + ble SGEMM_L8x16_SUB4 -.LSGEMM_L8x16_LOOP_START: +SGEMM_L8x16_LOOP_START: dcbt AO, PRE + dcbt BO, PRE LOAD8x16_1 + dcbt BO, PRE KERNEL8x16_I1 + dcbt BO, PRE dcbt AO, PRE KERNEL8x16_2 + dcbt BO, PRE KERNEL8x16_1 + dcbt BO, PRE dcbt AO, PRE KERNEL8x16_2 + dcbt BO, PRE KERNEL8x16_1 + dcbt BO, PRE dcbt AO, PRE KERNEL8x16_2 + dcbt BO, PRE KERNEL8x16_1 + dcbt BO, PRE dcbt AO, PRE KERNEL8x16_2 addic. L, L, -2 - ble .LSGEMM_L8x16_LOOP_END + ble SGEMM_L8x16_LOOP_END .align 5 -.LSGEMM_L8x16_LOOP: +SGEMM_L8x16_LOOP: + dcbt BO, PRE KERNEL8x16_1 + dcbt BO, PRE dcbt AO, PRE KERNEL8x16_2 + dcbt BO, PRE KERNEL8x16_1 + dcbt BO, PRE dcbt AO, PRE KERNEL8x16_2 + dcbt BO, PRE KERNEL8x16_1 + dcbt BO, PRE dcbt AO, PRE KERNEL8x16_2 + dcbt BO, PRE KERNEL8x16_1 + dcbt BO, PRE dcbt AO, PRE KERNEL8x16_2 addic. 
L, L, -1 - bgt .LSGEMM_L8x16_LOOP + bgt SGEMM_L8x16_LOOP -.LSGEMM_L8x16_LOOP_END: +SGEMM_L8x16_LOOP_END: + dcbt BO, PRE KERNEL8x16_1 + dcbt BO, PRE dcbt AO, PRE KERNEL8x16_2 + dcbt BO, PRE KERNEL8x16_1 + dcbt BO, PRE dcbt AO, PRE KERNEL8x16_2 + dcbt BO, PRE KERNEL8x16_1 + dcbt BO, PRE dcbt AO, PRE KERNEL8x16_2 KERNEL8x16_1 KERNEL8x16_E2 - b .LSGEMM_L8x16_SUB1 + b SGEMM_L8x16_SUB1 -.LSGEMM_L8x16_SUB4: +SGEMM_L8x16_SUB4: dcbt AO, PRE KERNEL8x16_SUBI1 @@ -127,53 +182,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL8x16_SUB1 KERNEL8x16_SUB1 - b .LSGEMM_L8x16_SUB1 + b SGEMM_L8x16_SUB1 -.LSGEMM_L8x16_SUB0: +SGEMM_L8x16_SUB0: andi. L, K, 7 KERNEL8x16_SUBI1 addic. L, L, -1 - ble .LSGEMM_L8x16_SAVE - b .LSGEMM_L8x16_SUB2 + ble SGEMM_L8x16_SAVE + b SGEMM_L8x16_SUB2 -.LSGEMM_L8x16_SUB1: +SGEMM_L8x16_SUB1: andi. L, K, 7 - ble .LSGEMM_L8x16_SAVE + ble SGEMM_L8x16_SAVE -.LSGEMM_L8x16_SUB2: +SGEMM_L8x16_SUB2: KERNEL8x16_SUB1 addic. L, L, -1 - bgt .LSGEMM_L8x16_SUB2 + bgt SGEMM_L8x16_SUB2 -.LSGEMM_L8x16_SAVE: +SGEMM_L8x16_SAVE: SAVE8x16 addic. I, I, -1 - bgt .LSGEMM_L8x16_BEGIN + bgt SGEMM_L8x16_BEGIN -.LSGEMM_L8x16_END: +SGEMM_L8x16_END: -.LSGEMM_L8x8_BEGIN: +SGEMM_L8x8_BEGIN: andi. T2, M, 15 - ble .LSGEMM_L8x1_END + ble SGEMM_L8x1_END andi. T1, M, 8 - ble .LSGEMM_L8x8_END - mr BO, B + ble SGEMM_L8x8_END + mr BO, BBUFFER srawi. L, K, 3 - ble .LSGEMM_L8x8_SUB0 + ble SGEMM_L8x8_SUB0 cmpwi cr0, L, 1 - ble .LSGEMM_L8x8_SUB4 + ble SGEMM_L8x8_SUB4 -.LSGEMM_L8x8_LOOP_START: +SGEMM_L8x8_LOOP_START: LOAD8x8_1 KERNEL8x8_I1 @@ -187,11 +242,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL8x8_2 addic. L, L, -2 - ble .LSGEMM_L8x8_LOOP_END + ble SGEMM_L8x8_LOOP_END .align 5 -.LSGEMM_L8x8_LOOP: +SGEMM_L8x8_LOOP: KERNEL8x8_1 KERNEL8x8_2 @@ -204,9 +259,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL8x8_2 addic. 
L, L, -1 - bgt .LSGEMM_L8x8_LOOP + bgt SGEMM_L8x8_LOOP -.LSGEMM_L8x8_LOOP_END: +SGEMM_L8x8_LOOP_END: KERNEL8x8_1 KERNEL8x8_2 @@ -218,9 +273,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL8x8_1 KERNEL8x8_E2 - b .LSGEMM_L8x8_SUB1 + b SGEMM_L8x8_SUB1 -.LSGEMM_L8x8_SUB4: +SGEMM_L8x8_SUB4: KERNEL8x8_SUBI1 KERNEL8x8_SUB1 @@ -232,48 +287,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL8x8_SUB1 KERNEL8x8_SUB1 - b .LSGEMM_L8x8_SUB1 + b SGEMM_L8x8_SUB1 -.LSGEMM_L8x8_SUB0: +SGEMM_L8x8_SUB0: andi. L, K, 7 KERNEL8x8_SUBI1 addic. L, L, -1 - ble .LSGEMM_L8x8_SAVE - b .LSGEMM_L8x8_SUB2 + ble SGEMM_L8x8_SAVE + b SGEMM_L8x8_SUB2 -.LSGEMM_L8x8_SUB1: +SGEMM_L8x8_SUB1: andi. L, K, 7 - ble .LSGEMM_L8x8_SAVE + ble SGEMM_L8x8_SAVE -.LSGEMM_L8x8_SUB2: +SGEMM_L8x8_SUB2: KERNEL8x8_SUB1 addic. L, L, -1 - bgt .LSGEMM_L8x8_SUB2 + bgt SGEMM_L8x8_SUB2 -.LSGEMM_L8x8_SAVE: +SGEMM_L8x8_SAVE: SAVE8x8 -.LSGEMM_L8x8_END: +SGEMM_L8x8_END: -.LSGEMM_L8x4_BEGIN: +SGEMM_L8x4_BEGIN: andi. T1, M, 4 - ble .LSGEMM_L8x4_END - mr BO, B + ble SGEMM_L8x4_END + mr BO, BBUFFER srawi. L, K, 3 - ble .LSGEMM_L8x4_SUB0 + ble SGEMM_L8x4_SUB0 cmpwi cr0, L, 1 - ble .LSGEMM_L8x4_SUB4 + ble SGEMM_L8x4_SUB4 -.LSGEMM_L8x4_LOOP_START: +SGEMM_L8x4_LOOP_START: LOAD8x4_1 KERNEL8x4_I1 @@ -287,11 +342,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL8x4_2 addic. L, L, -2 - ble .LSGEMM_L8x4_LOOP_END + ble SGEMM_L8x4_LOOP_END .align 5 -.LSGEMM_L8x4_LOOP: +SGEMM_L8x4_LOOP: KERNEL8x4_1 KERNEL8x4_2 @@ -304,9 +359,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL8x4_2 addic. L, L, -1 - bgt .LSGEMM_L8x4_LOOP + bgt SGEMM_L8x4_LOOP -.LSGEMM_L8x4_LOOP_END: +SGEMM_L8x4_LOOP_END: KERNEL8x4_1 KERNEL8x4_2 @@ -318,9 +373,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
KERNEL8x4_1 KERNEL8x4_E2 - b .LSGEMM_L8x4_SUB1 + b SGEMM_L8x4_SUB1 -.LSGEMM_L8x4_SUB4: +SGEMM_L8x4_SUB4: KERNEL8x4_SUBI1 KERNEL8x4_SUB1 @@ -332,48 +387,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL8x4_SUB1 KERNEL8x4_SUB1 - b .LSGEMM_L8x4_SUB1 + b SGEMM_L8x4_SUB1 -.LSGEMM_L8x4_SUB0: +SGEMM_L8x4_SUB0: andi. L, K, 7 KERNEL8x4_SUBI1 addic. L, L, -1 - ble .LSGEMM_L8x4_SAVE - b .LSGEMM_L8x4_SUB2 + ble SGEMM_L8x4_SAVE + b SGEMM_L8x4_SUB2 -.LSGEMM_L8x4_SUB1: +SGEMM_L8x4_SUB1: andi. L, K, 7 - ble .LSGEMM_L8x4_SAVE + ble SGEMM_L8x4_SAVE -.LSGEMM_L8x4_SUB2: +SGEMM_L8x4_SUB2: KERNEL8x4_SUB1 addic. L, L, -1 - bgt .LSGEMM_L8x4_SUB2 + bgt SGEMM_L8x4_SUB2 -.LSGEMM_L8x4_SAVE: +SGEMM_L8x4_SAVE: SAVE8x4 -.LSGEMM_L8x4_END: +SGEMM_L8x4_END: -.LSGEMM_L8x2_BEGIN: +SGEMM_L8x2_BEGIN: andi. T1, M, 2 - ble .LSGEMM_L8x2_END - mr BO, B + ble SGEMM_L8x2_END + mr BO, BBUFFER srawi. L, K, 3 - ble .LSGEMM_L8x2_SUB0 + ble SGEMM_L8x2_SUB0 cmpwi cr0, L, 1 - ble .LSGEMM_L8x2_SUB4 + ble SGEMM_L8x2_SUB4 -.LSGEMM_L8x2_LOOP_START: +SGEMM_L8x2_LOOP_START: LOAD8x2_1 KERNEL8x2_I1 @@ -387,11 +442,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL8x2_2 addic. L, L, -2 - ble .LSGEMM_L8x2_LOOP_END + ble SGEMM_L8x2_LOOP_END .align 5 -.LSGEMM_L8x2_LOOP: +SGEMM_L8x2_LOOP: KERNEL8x2_1 KERNEL8x2_2 @@ -404,9 +459,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL8x2_2 addic. L, L, -1 - bgt .LSGEMM_L8x2_LOOP + bgt SGEMM_L8x2_LOOP -.LSGEMM_L8x2_LOOP_END: +SGEMM_L8x2_LOOP_END: KERNEL8x2_1 KERNEL8x2_2 @@ -418,9 +473,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL8x2_1 KERNEL8x2_E2 - b .LSGEMM_L8x2_SUB1 + b SGEMM_L8x2_SUB1 -.LSGEMM_L8x2_SUB4: +SGEMM_L8x2_SUB4: KERNEL8x2_SUBI1 KERNEL8x2_SUB1 @@ -432,48 +487,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
KERNEL8x2_SUB1 KERNEL8x2_SUB1 - b .LSGEMM_L8x2_SUB1 + b SGEMM_L8x2_SUB1 -.LSGEMM_L8x2_SUB0: +SGEMM_L8x2_SUB0: andi. L, K, 7 KERNEL8x2_SUBI1 addic. L, L, -1 - ble .LSGEMM_L8x2_SAVE - b .LSGEMM_L8x2_SUB2 + ble SGEMM_L8x2_SAVE + b SGEMM_L8x2_SUB2 -.LSGEMM_L8x2_SUB1: +SGEMM_L8x2_SUB1: andi. L, K, 7 - ble .LSGEMM_L8x2_SAVE + ble SGEMM_L8x2_SAVE -.LSGEMM_L8x2_SUB2: +SGEMM_L8x2_SUB2: KERNEL8x2_SUB1 addic. L, L, -1 - bgt .LSGEMM_L8x2_SUB2 + bgt SGEMM_L8x2_SUB2 -.LSGEMM_L8x2_SAVE: +SGEMM_L8x2_SAVE: SAVE8x2 -.LSGEMM_L8x2_END: +SGEMM_L8x2_END: -.LSGEMM_L8x1_BEGIN: +SGEMM_L8x1_BEGIN: andi. T1, M, 1 - ble .LSGEMM_L8x1_END - mr BO, B + ble SGEMM_L8x1_END + mr BO, BBUFFER srawi. L, K, 3 - ble .LSGEMM_L8x1_SUB0 + ble SGEMM_L8x1_SUB0 cmpwi cr0, L, 1 - ble .LSGEMM_L8x1_SUB4 + ble SGEMM_L8x1_SUB4 -.LSGEMM_L8x1_LOOP_START: +SGEMM_L8x1_LOOP_START: LOAD8x1_1 KERNEL8x1_I1 @@ -487,11 +542,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL8x1_2 addic. L, L, -2 - ble .LSGEMM_L8x1_LOOP_END + ble SGEMM_L8x1_LOOP_END .align 5 -.LSGEMM_L8x1_LOOP: +SGEMM_L8x1_LOOP: KERNEL8x1_1 KERNEL8x1_2 @@ -504,9 +559,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL8x1_2 addic. L, L, -1 - bgt .LSGEMM_L8x1_LOOP + bgt SGEMM_L8x1_LOOP -.LSGEMM_L8x1_LOOP_END: +SGEMM_L8x1_LOOP_END: KERNEL8x1_1 KERNEL8x1_2 @@ -518,9 +573,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL8x1_1 KERNEL8x1_E2 - b .LSGEMM_L8x1_SUB1 + b SGEMM_L8x1_SUB1 -.LSGEMM_L8x1_SUB4: +SGEMM_L8x1_SUB4: KERNEL8x1_SUBI1 KERNEL8x1_SUB1 @@ -532,74 +587,106 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL8x1_SUB1 KERNEL8x1_SUB1 - b .LSGEMM_L8x1_SUB1 + b SGEMM_L8x1_SUB1 -.LSGEMM_L8x1_SUB0: +SGEMM_L8x1_SUB0: andi. L, K, 7 KERNEL8x1_SUBI1 addic. L, L, -1 - ble .LSGEMM_L8x1_SAVE - b .LSGEMM_L8x1_SUB2 + ble SGEMM_L8x1_SAVE + b SGEMM_L8x1_SUB2 -.LSGEMM_L8x1_SUB1: +SGEMM_L8x1_SUB1: andi. 
L, K, 7 - ble .LSGEMM_L8x1_SAVE + ble SGEMM_L8x1_SAVE -.LSGEMM_L8x1_SUB2: +SGEMM_L8x1_SUB2: KERNEL8x1_SUB1 addic. L, L, -1 - bgt .LSGEMM_L8x1_SUB2 + bgt SGEMM_L8x1_SUB2 -.LSGEMM_L8x1_SAVE: +SGEMM_L8x1_SAVE: SAVE8x1 -.LSGEMM_L8x1_END: +SGEMM_L8x1_END: slwi T1, K, 5 add B, B, T1 addic. J, J, -1 - bgt .LSGEMM_L8_BEGIN + bgt SGEMM_L8_BEGIN andi. T2, N, 7 - ble .L999 + ble L999 + +SGEMM_L8_END: -.LSGEMM_L8_END: + b SGEMM_L4_BEGIN - b .LSGEMM_L4_BEGIN +L999_H1: -.L999_H1: + b L999 - b .L999 +SGEMM_L4_BEGIN: -.LSGEMM_L4_BEGIN: + mr BO, B + mr BBO, BBUFFER + slwi T1, K, 2 + +SGEMM_L4_COPYB: + dcbtst BBO, PRE + + lxvw4x vs3, o0, BO + lxvw4x vs11, o16, BO + xxspltw vs4, vs3, 0 + xxspltw vs5, vs3, 1 + xxspltw vs6, vs3, 2 + xxspltw vs7, vs3, 3 + xxspltw vs12, vs11, 0 + xxspltw vs13, vs11, 1 + xxspltw vs14, vs11, 2 + xxspltw vs15, vs11, 3 + stxvw4x vs4, o0, BBO + stxvw4x vs5, o16, BBO + stxvw4x vs6, o32, BBO + stxvw4x vs7, o48, BBO + addi BO, BO, 32 + addi BBO, BBO, 64 + stxvw4x vs12, o0, BBO + stxvw4x vs13, o16, BBO + stxvw4x vs14, o32, BBO + stxvw4x vs15, o48, BBO + addic. T1, T1, -8 + addi BBO, BBO, 64 + + bge SGEMM_L4_COPYB andi. T1, N, 4 - ble .LSGEMM_L4_END + ble SGEMM_L4_END mr CO, C mr AO, A slwi T1, LDC , 2 add C, C, T1 srawi. I, M, 4 - ble .LSGEMM_L4x16_END + ble SGEMM_L4x16_END -.LSGEMM_L4x16_BEGIN: +SGEMM_L4x16_BEGIN: - mr BO, B + mr BO, BBUFFER srawi. L, K, 3 - ble .LSGEMM_L4x16_SUB0 + ble SGEMM_L4x16_SUB0 cmpwi cr0, L, 1 - ble .LSGEMM_L4x16_SUB4 + ble SGEMM_L4x16_SUB4 -.LSGEMM_L4x16_LOOP_START: +SGEMM_L4x16_LOOP_START: dcbt AO, PRE LOAD4x16_1 @@ -618,11 +705,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x16_2 addic. L, L, -2 - ble .LSGEMM_L4x16_LOOP_END + ble SGEMM_L4x16_LOOP_END .align 5 -.LSGEMM_L4x16_LOOP: +SGEMM_L4x16_LOOP: KERNEL4x16_1 dcbt AO, PRE @@ -639,9 +726,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x16_2 addic. 
L, L, -1 - bgt .LSGEMM_L4x16_LOOP + bgt SGEMM_L4x16_LOOP -.LSGEMM_L4x16_LOOP_END: +SGEMM_L4x16_LOOP_END: KERNEL4x16_1 dcbt AO, PRE @@ -656,9 +743,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x16_1 KERNEL4x16_E2 - b .LSGEMM_L4x16_SUB1 + b SGEMM_L4x16_SUB1 -.LSGEMM_L4x16_SUB4: +SGEMM_L4x16_SUB4: dcbt AO, PRE KERNEL4x16_SUBI1 @@ -672,53 +759,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x16_SUB1 KERNEL4x16_SUB1 - b .LSGEMM_L4x16_SUB1 + b SGEMM_L4x16_SUB1 -.LSGEMM_L4x16_SUB0: +SGEMM_L4x16_SUB0: andi. L, K, 7 KERNEL4x16_SUBI1 addic. L, L, -1 - ble .LSGEMM_L4x16_SAVE - b .LSGEMM_L4x16_SUB2 + ble SGEMM_L4x16_SAVE + b SGEMM_L4x16_SUB2 -.LSGEMM_L4x16_SUB1: +SGEMM_L4x16_SUB1: andi. L, K, 7 - ble .LSGEMM_L4x16_SAVE + ble SGEMM_L4x16_SAVE -.LSGEMM_L4x16_SUB2: +SGEMM_L4x16_SUB2: KERNEL4x16_SUB1 addic. L, L, -1 - bgt .LSGEMM_L4x16_SUB2 + bgt SGEMM_L4x16_SUB2 -.LSGEMM_L4x16_SAVE: +SGEMM_L4x16_SAVE: SAVE4x16 addic. I, I, -1 - bgt .LSGEMM_L4x16_BEGIN + bgt SGEMM_L4x16_BEGIN -.LSGEMM_L4x16_END: +SGEMM_L4x16_END: -.LSGEMM_L4x8_BEGIN: +SGEMM_L4x8_BEGIN: andi. T2, M, 15 - ble .LSGEMM_L4x1_END + ble SGEMM_L4x1_END andi. T1, M, 8 - ble .LSGEMM_L4x8_END - mr BO, B + ble SGEMM_L4x8_END + mr BO, BBUFFER srawi. L, K, 3 - ble .LSGEMM_L4x8_SUB0 + ble SGEMM_L4x8_SUB0 cmpwi cr0, L, 1 - ble .LSGEMM_L4x8_SUB4 + ble SGEMM_L4x8_SUB4 -.LSGEMM_L4x8_LOOP_START: +SGEMM_L4x8_LOOP_START: LOAD4x8_1 KERNEL4x8_I1 @@ -732,11 +819,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x8_2 addic. L, L, -2 - ble .LSGEMM_L4x8_LOOP_END + ble SGEMM_L4x8_LOOP_END .align 5 -.LSGEMM_L4x8_LOOP: +SGEMM_L4x8_LOOP: KERNEL4x8_1 KERNEL4x8_2 @@ -749,9 +836,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x8_2 addic. 
L, L, -1 - bgt .LSGEMM_L4x8_LOOP + bgt SGEMM_L4x8_LOOP -.LSGEMM_L4x8_LOOP_END: +SGEMM_L4x8_LOOP_END: KERNEL4x8_1 KERNEL4x8_2 @@ -763,9 +850,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x8_1 KERNEL4x8_E2 - b .LSGEMM_L4x8_SUB1 + b SGEMM_L4x8_SUB1 -.LSGEMM_L4x8_SUB4: +SGEMM_L4x8_SUB4: KERNEL4x8_SUBI1 KERNEL4x8_SUB1 @@ -777,48 +864,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x8_SUB1 KERNEL4x8_SUB1 - b .LSGEMM_L4x8_SUB1 + b SGEMM_L4x8_SUB1 -.LSGEMM_L4x8_SUB0: +SGEMM_L4x8_SUB0: andi. L, K, 7 KERNEL4x8_SUBI1 addic. L, L, -1 - ble .LSGEMM_L4x8_SAVE - b .LSGEMM_L4x8_SUB2 + ble SGEMM_L4x8_SAVE + b SGEMM_L4x8_SUB2 -.LSGEMM_L4x8_SUB1: +SGEMM_L4x8_SUB1: andi. L, K, 7 - ble .LSGEMM_L4x8_SAVE + ble SGEMM_L4x8_SAVE -.LSGEMM_L4x8_SUB2: +SGEMM_L4x8_SUB2: KERNEL4x8_SUB1 addic. L, L, -1 - bgt .LSGEMM_L4x8_SUB2 + bgt SGEMM_L4x8_SUB2 -.LSGEMM_L4x8_SAVE: +SGEMM_L4x8_SAVE: SAVE4x8 -.LSGEMM_L4x8_END: +SGEMM_L4x8_END: -.LSGEMM_L4x4_BEGIN: +SGEMM_L4x4_BEGIN: andi. T1, M, 4 - ble .LSGEMM_L4x4_END - mr BO, B + ble SGEMM_L4x4_END + mr BO, BBUFFER srawi. L, K, 3 - ble .LSGEMM_L4x4_SUB0 + ble SGEMM_L4x4_SUB0 cmpwi cr0, L, 1 - ble .LSGEMM_L4x4_SUB4 + ble SGEMM_L4x4_SUB4 -.LSGEMM_L4x4_LOOP_START: +SGEMM_L4x4_LOOP_START: LOAD4x4_1 KERNEL4x4_I1 @@ -832,11 +919,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x4_2 addic. L, L, -2 - ble .LSGEMM_L4x4_LOOP_END + ble SGEMM_L4x4_LOOP_END .align 5 -.LSGEMM_L4x4_LOOP: +SGEMM_L4x4_LOOP: KERNEL4x4_1 KERNEL4x4_2 @@ -849,9 +936,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x4_2 addic. L, L, -1 - bgt .LSGEMM_L4x4_LOOP + bgt SGEMM_L4x4_LOOP -.LSGEMM_L4x4_LOOP_END: +SGEMM_L4x4_LOOP_END: KERNEL4x4_1 KERNEL4x4_2 @@ -863,9 +950,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
KERNEL4x4_1 KERNEL4x4_E2 - b .LSGEMM_L4x4_SUB1 + b SGEMM_L4x4_SUB1 -.LSGEMM_L4x4_SUB4: +SGEMM_L4x4_SUB4: KERNEL4x4_SUBI1 KERNEL4x4_SUB1 @@ -877,48 +964,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x4_SUB1 KERNEL4x4_SUB1 - b .LSGEMM_L4x4_SUB1 + b SGEMM_L4x4_SUB1 -.LSGEMM_L4x4_SUB0: +SGEMM_L4x4_SUB0: andi. L, K, 7 KERNEL4x4_SUBI1 addic. L, L, -1 - ble .LSGEMM_L4x4_SAVE - b .LSGEMM_L4x4_SUB2 + ble SGEMM_L4x4_SAVE + b SGEMM_L4x4_SUB2 -.LSGEMM_L4x4_SUB1: +SGEMM_L4x4_SUB1: andi. L, K, 7 - ble .LSGEMM_L4x4_SAVE + ble SGEMM_L4x4_SAVE -.LSGEMM_L4x4_SUB2: +SGEMM_L4x4_SUB2: KERNEL4x4_SUB1 addic. L, L, -1 - bgt .LSGEMM_L4x4_SUB2 + bgt SGEMM_L4x4_SUB2 -.LSGEMM_L4x4_SAVE: +SGEMM_L4x4_SAVE: SAVE4x4 -.LSGEMM_L4x4_END: +SGEMM_L4x4_END: -.LSGEMM_L4x2_BEGIN: +SGEMM_L4x2_BEGIN: andi. T1, M, 2 - ble .LSGEMM_L4x2_END - mr BO, B + ble SGEMM_L4x2_END + mr BO, BBUFFER srawi. L, K, 3 - ble .LSGEMM_L4x2_SUB0 + ble SGEMM_L4x2_SUB0 cmpwi cr0, L, 1 - ble .LSGEMM_L4x2_SUB4 + ble SGEMM_L4x2_SUB4 -.LSGEMM_L4x2_LOOP_START: +SGEMM_L4x2_LOOP_START: LOAD4x2_1 KERNEL4x2_I1 @@ -932,11 +1019,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x2_2 addic. L, L, -2 - ble .LSGEMM_L4x2_LOOP_END + ble SGEMM_L4x2_LOOP_END .align 5 -.LSGEMM_L4x2_LOOP: +SGEMM_L4x2_LOOP: KERNEL4x2_1 KERNEL4x2_2 @@ -949,9 +1036,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x2_2 addic. L, L, -1 - bgt .LSGEMM_L4x2_LOOP + bgt SGEMM_L4x2_LOOP -.LSGEMM_L4x2_LOOP_END: +SGEMM_L4x2_LOOP_END: KERNEL4x2_1 KERNEL4x2_2 @@ -963,9 +1050,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x2_1 KERNEL4x2_E2 - b .LSGEMM_L4x2_SUB1 + b SGEMM_L4x2_SUB1 -.LSGEMM_L4x2_SUB4: +SGEMM_L4x2_SUB4: KERNEL4x2_SUBI1 KERNEL4x2_SUB1 @@ -977,48 +1064,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
KERNEL4x2_SUB1 KERNEL4x2_SUB1 - b .LSGEMM_L4x2_SUB1 + b SGEMM_L4x2_SUB1 -.LSGEMM_L4x2_SUB0: +SGEMM_L4x2_SUB0: andi. L, K, 7 KERNEL4x2_SUBI1 addic. L, L, -1 - ble .LSGEMM_L4x2_SAVE - b .LSGEMM_L4x2_SUB2 + ble SGEMM_L4x2_SAVE + b SGEMM_L4x2_SUB2 -.LSGEMM_L4x2_SUB1: +SGEMM_L4x2_SUB1: andi. L, K, 7 - ble .LSGEMM_L4x2_SAVE + ble SGEMM_L4x2_SAVE -.LSGEMM_L4x2_SUB2: +SGEMM_L4x2_SUB2: KERNEL4x2_SUB1 addic. L, L, -1 - bgt .LSGEMM_L4x2_SUB2 + bgt SGEMM_L4x2_SUB2 -.LSGEMM_L4x2_SAVE: +SGEMM_L4x2_SAVE: SAVE4x2 -.LSGEMM_L4x2_END: +SGEMM_L4x2_END: -.LSGEMM_L4x1_BEGIN: +SGEMM_L4x1_BEGIN: andi. T1, M, 1 - ble .LSGEMM_L4x1_END - mr BO, B + ble SGEMM_L4x1_END + mr BO, BBUFFER srawi. L, K, 3 - ble .LSGEMM_L4x1_SUB0 + ble SGEMM_L4x1_SUB0 cmpwi cr0, L, 1 - ble .LSGEMM_L4x1_SUB4 + ble SGEMM_L4x1_SUB4 -.LSGEMM_L4x1_LOOP_START: +SGEMM_L4x1_LOOP_START: LOAD4x1_1 KERNEL4x1_I1 @@ -1032,11 +1119,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x1_2 addic. L, L, -2 - ble .LSGEMM_L4x1_LOOP_END + ble SGEMM_L4x1_LOOP_END .align 5 -.LSGEMM_L4x1_LOOP: +SGEMM_L4x1_LOOP: KERNEL4x1_1 KERNEL4x1_2 @@ -1049,9 +1136,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x1_2 addic. L, L, -1 - bgt .LSGEMM_L4x1_LOOP + bgt SGEMM_L4x1_LOOP -.LSGEMM_L4x1_LOOP_END: +SGEMM_L4x1_LOOP_END: KERNEL4x1_1 KERNEL4x1_2 @@ -1063,9 +1150,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x1_1 KERNEL4x1_E2 - b .LSGEMM_L4x1_SUB1 + b SGEMM_L4x1_SUB1 -.LSGEMM_L4x1_SUB4: +SGEMM_L4x1_SUB4: KERNEL4x1_SUBI1 KERNEL4x1_SUB1 @@ -1077,61 +1164,93 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x1_SUB1 KERNEL4x1_SUB1 - b .LSGEMM_L4x1_SUB1 + b SGEMM_L4x1_SUB1 -.LSGEMM_L4x1_SUB0: +SGEMM_L4x1_SUB0: andi. L, K, 7 KERNEL4x1_SUBI1 addic. L, L, -1 - ble .LSGEMM_L4x1_SAVE - b .LSGEMM_L4x1_SUB2 + ble SGEMM_L4x1_SAVE + b SGEMM_L4x1_SUB2 -.LSGEMM_L4x1_SUB1: +SGEMM_L4x1_SUB1: andi. 
L, K, 7 - ble .LSGEMM_L4x1_SAVE + ble SGEMM_L4x1_SAVE -.LSGEMM_L4x1_SUB2: +SGEMM_L4x1_SUB2: KERNEL4x1_SUB1 addic. L, L, -1 - bgt .LSGEMM_L4x1_SUB2 + bgt SGEMM_L4x1_SUB2 -.LSGEMM_L4x1_SAVE: +SGEMM_L4x1_SAVE: SAVE4x1 -.LSGEMM_L4x1_END: +SGEMM_L4x1_END: slwi T1, K, 4 add B, B, T1 -.LSGEMM_L4_END: -.LSGEMM_L2_BEGIN: +SGEMM_L4_END: +SGEMM_L2_BEGIN: + + mr BO, B + mr BBO, BBUFFER + slwi T1, K, 1 + +SGEMM_L2_COPYB: + dcbtst BBO, PRE + + lxvw4x vs3, o0, BO + lxvw4x vs11, o16, BO + xxspltw vs4, vs3, 0 + xxspltw vs5, vs3, 1 + xxspltw vs6, vs3, 2 + xxspltw vs7, vs3, 3 + xxspltw vs12, vs11, 0 + xxspltw vs13, vs11, 1 + xxspltw vs14, vs11, 2 + xxspltw vs15, vs11, 3 + stxvw4x vs4, o0, BBO + stxvw4x vs5, o16, BBO + stxvw4x vs6, o32, BBO + stxvw4x vs7, o48, BBO + addi BO, BO, 32 + addi BBO, BBO, 64 + stxvw4x vs12, o0, BBO + stxvw4x vs13, o16, BBO + stxvw4x vs14, o32, BBO + stxvw4x vs15, o48, BBO + addic. T1, T1, -8 + addi BBO, BBO, 64 + + bge SGEMM_L2_COPYB andi. T1, N, 2 - ble .LSGEMM_L2_END + ble SGEMM_L2_END mr CO, C mr AO, A slwi T1, LDC , 1 add C, C, T1 srawi. I, M, 4 - ble .LSGEMM_L2x16_END + ble SGEMM_L2x16_END -.LSGEMM_L2x16_BEGIN: +SGEMM_L2x16_BEGIN: - mr BO, B + mr BO, BBUFFER srawi. L, K, 3 - ble .LSGEMM_L2x16_SUB0 + ble SGEMM_L2x16_SUB0 cmpwi cr0, L, 1 - ble .LSGEMM_L2x16_SUB4 + ble SGEMM_L2x16_SUB4 -.LSGEMM_L2x16_LOOP_START: +SGEMM_L2x16_LOOP_START: dcbt AO, PRE LOAD2x16_1 @@ -1150,11 +1269,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x16_2 addic. L, L, -2 - ble .LSGEMM_L2x16_LOOP_END + ble SGEMM_L2x16_LOOP_END .align 5 -.LSGEMM_L2x16_LOOP: +SGEMM_L2x16_LOOP: KERNEL2x16_1 dcbt AO, PRE @@ -1171,9 +1290,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x16_2 addic. L, L, -1 - bgt .LSGEMM_L2x16_LOOP + bgt SGEMM_L2x16_LOOP -.LSGEMM_L2x16_LOOP_END: +SGEMM_L2x16_LOOP_END: KERNEL2x16_1 dcbt AO, PRE @@ -1188,9 +1307,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
KERNEL2x16_1 KERNEL2x16_E2 - b .LSGEMM_L2x16_SUB1 + b SGEMM_L2x16_SUB1 -.LSGEMM_L2x16_SUB4: +SGEMM_L2x16_SUB4: dcbt AO, PRE KERNEL2x16_SUBI1 @@ -1204,53 +1323,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x16_SUB1 KERNEL2x16_SUB1 - b .LSGEMM_L2x16_SUB1 + b SGEMM_L2x16_SUB1 -.LSGEMM_L2x16_SUB0: +SGEMM_L2x16_SUB0: andi. L, K, 7 KERNEL2x16_SUBI1 addic. L, L, -1 - ble .LSGEMM_L2x16_SAVE - b .LSGEMM_L2x16_SUB2 + ble SGEMM_L2x16_SAVE + b SGEMM_L2x16_SUB2 -.LSGEMM_L2x16_SUB1: +SGEMM_L2x16_SUB1: andi. L, K, 7 - ble .LSGEMM_L2x16_SAVE + ble SGEMM_L2x16_SAVE -.LSGEMM_L2x16_SUB2: +SGEMM_L2x16_SUB2: KERNEL2x16_SUB1 addic. L, L, -1 - bgt .LSGEMM_L2x16_SUB2 + bgt SGEMM_L2x16_SUB2 -.LSGEMM_L2x16_SAVE: +SGEMM_L2x16_SAVE: SAVE2x16 addic. I, I, -1 - bgt .LSGEMM_L2x16_BEGIN + bgt SGEMM_L2x16_BEGIN -.LSGEMM_L2x16_END: +SGEMM_L2x16_END: -.LSGEMM_L2x8_BEGIN: +SGEMM_L2x8_BEGIN: andi. T2, M, 15 - ble .LSGEMM_L2x1_END + ble SGEMM_L2x1_END andi. T1, M, 8 - ble .LSGEMM_L2x8_END - mr BO, B + ble SGEMM_L2x8_END + mr BO, BBUFFER srawi. L, K, 3 - ble .LSGEMM_L2x8_SUB0 + ble SGEMM_L2x8_SUB0 cmpwi cr0, L, 1 - ble .LSGEMM_L2x8_SUB4 + ble SGEMM_L2x8_SUB4 -.LSGEMM_L2x8_LOOP_START: +SGEMM_L2x8_LOOP_START: LOAD2x8_1 KERNEL2x8_I1 @@ -1264,11 +1383,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x8_2 addic. L, L, -2 - ble .LSGEMM_L2x8_LOOP_END + ble SGEMM_L2x8_LOOP_END .align 5 -.LSGEMM_L2x8_LOOP: +SGEMM_L2x8_LOOP: KERNEL2x8_1 KERNEL2x8_2 @@ -1281,9 +1400,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x8_2 addic. L, L, -1 - bgt .LSGEMM_L2x8_LOOP + bgt SGEMM_L2x8_LOOP -.LSGEMM_L2x8_LOOP_END: +SGEMM_L2x8_LOOP_END: KERNEL2x8_1 KERNEL2x8_2 @@ -1295,9 +1414,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
KERNEL2x8_1 KERNEL2x8_E2 - b .LSGEMM_L2x8_SUB1 + b SGEMM_L2x8_SUB1 -.LSGEMM_L2x8_SUB4: +SGEMM_L2x8_SUB4: KERNEL2x8_SUBI1 KERNEL2x8_SUB1 @@ -1309,48 +1428,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x8_SUB1 KERNEL2x8_SUB1 - b .LSGEMM_L2x8_SUB1 + b SGEMM_L2x8_SUB1 -.LSGEMM_L2x8_SUB0: +SGEMM_L2x8_SUB0: andi. L, K, 7 KERNEL2x8_SUBI1 addic. L, L, -1 - ble .LSGEMM_L2x8_SAVE - b .LSGEMM_L2x8_SUB2 + ble SGEMM_L2x8_SAVE + b SGEMM_L2x8_SUB2 -.LSGEMM_L2x8_SUB1: +SGEMM_L2x8_SUB1: andi. L, K, 7 - ble .LSGEMM_L2x8_SAVE + ble SGEMM_L2x8_SAVE -.LSGEMM_L2x8_SUB2: +SGEMM_L2x8_SUB2: KERNEL2x8_SUB1 addic. L, L, -1 - bgt .LSGEMM_L2x8_SUB2 + bgt SGEMM_L2x8_SUB2 -.LSGEMM_L2x8_SAVE: +SGEMM_L2x8_SAVE: SAVE2x8 -.LSGEMM_L2x8_END: +SGEMM_L2x8_END: -.LSGEMM_L2x4_BEGIN: +SGEMM_L2x4_BEGIN: andi. T1, M, 4 - ble .LSGEMM_L2x4_END - mr BO, B + ble SGEMM_L2x4_END + mr BO, BBUFFER srawi. L, K, 3 - ble .LSGEMM_L2x4_SUB0 + ble SGEMM_L2x4_SUB0 cmpwi cr0, L, 1 - ble .LSGEMM_L2x4_SUB4 + ble SGEMM_L2x4_SUB4 -.LSGEMM_L2x4_LOOP_START: +SGEMM_L2x4_LOOP_START: LOAD2x4_1 KERNEL2x4_I1 @@ -1364,11 +1483,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x4_2 addic. L, L, -2 - ble .LSGEMM_L2x4_LOOP_END + ble SGEMM_L2x4_LOOP_END .align 5 -.LSGEMM_L2x4_LOOP: +SGEMM_L2x4_LOOP: KERNEL2x4_1 KERNEL2x4_2 @@ -1381,9 +1500,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x4_2 addic. L, L, -1 - bgt .LSGEMM_L2x4_LOOP + bgt SGEMM_L2x4_LOOP -.LSGEMM_L2x4_LOOP_END: +SGEMM_L2x4_LOOP_END: KERNEL2x4_1 KERNEL2x4_2 @@ -1395,9 +1514,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x4_1 KERNEL2x4_E2 - b .LSGEMM_L2x4_SUB1 + b SGEMM_L2x4_SUB1 -.LSGEMM_L2x4_SUB4: +SGEMM_L2x4_SUB4: KERNEL2x4_SUBI1 KERNEL2x4_SUB1 @@ -1409,48 +1528,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
KERNEL2x4_SUB1 KERNEL2x4_SUB1 - b .LSGEMM_L2x4_SUB1 + b SGEMM_L2x4_SUB1 -.LSGEMM_L2x4_SUB0: +SGEMM_L2x4_SUB0: andi. L, K, 7 KERNEL2x4_SUBI1 addic. L, L, -1 - ble .LSGEMM_L2x4_SAVE - b .LSGEMM_L2x4_SUB2 + ble SGEMM_L2x4_SAVE + b SGEMM_L2x4_SUB2 -.LSGEMM_L2x4_SUB1: +SGEMM_L2x4_SUB1: andi. L, K, 7 - ble .LSGEMM_L2x4_SAVE + ble SGEMM_L2x4_SAVE -.LSGEMM_L2x4_SUB2: +SGEMM_L2x4_SUB2: KERNEL2x4_SUB1 addic. L, L, -1 - bgt .LSGEMM_L2x4_SUB2 + bgt SGEMM_L2x4_SUB2 -.LSGEMM_L2x4_SAVE: +SGEMM_L2x4_SAVE: SAVE2x4 -.LSGEMM_L2x4_END: +SGEMM_L2x4_END: -.LSGEMM_L2x2_BEGIN: +SGEMM_L2x2_BEGIN: andi. T1, M, 2 - ble .LSGEMM_L2x2_END - mr BO, B + ble SGEMM_L2x2_END + mr BO, BBUFFER srawi. L, K, 3 - ble .LSGEMM_L2x2_SUB0 + ble SGEMM_L2x2_SUB0 cmpwi cr0, L, 1 - ble .LSGEMM_L2x2_SUB4 + ble SGEMM_L2x2_SUB4 -.LSGEMM_L2x2_LOOP_START: +SGEMM_L2x2_LOOP_START: LOAD2x2_1 KERNEL2x2_I1 @@ -1464,11 +1583,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x2_2 addic. L, L, -2 - ble .LSGEMM_L2x2_LOOP_END + ble SGEMM_L2x2_LOOP_END .align 5 -.LSGEMM_L2x2_LOOP: +SGEMM_L2x2_LOOP: KERNEL2x2_1 KERNEL2x2_2 @@ -1481,9 +1600,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x2_2 addic. L, L, -1 - bgt .LSGEMM_L2x2_LOOP + bgt SGEMM_L2x2_LOOP -.LSGEMM_L2x2_LOOP_END: +SGEMM_L2x2_LOOP_END: KERNEL2x2_1 KERNEL2x2_2 @@ -1495,9 +1614,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x2_1 KERNEL2x2_E2 - b .LSGEMM_L2x2_SUB1 + b SGEMM_L2x2_SUB1 -.LSGEMM_L2x2_SUB4: +SGEMM_L2x2_SUB4: KERNEL2x2_SUBI1 KERNEL2x2_SUB1 @@ -1509,48 +1628,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x2_SUB1 KERNEL2x2_SUB1 - b .LSGEMM_L2x2_SUB1 + b SGEMM_L2x2_SUB1 -.LSGEMM_L2x2_SUB0: +SGEMM_L2x2_SUB0: andi. L, K, 7 KERNEL2x2_SUBI1 addic. L, L, -1 - ble .LSGEMM_L2x2_SAVE - b .LSGEMM_L2x2_SUB2 + ble SGEMM_L2x2_SAVE + b SGEMM_L2x2_SUB2 -.LSGEMM_L2x2_SUB1: +SGEMM_L2x2_SUB1: andi. 
L, K, 7 - ble .LSGEMM_L2x2_SAVE + ble SGEMM_L2x2_SAVE -.LSGEMM_L2x2_SUB2: +SGEMM_L2x2_SUB2: KERNEL2x2_SUB1 addic. L, L, -1 - bgt .LSGEMM_L2x2_SUB2 + bgt SGEMM_L2x2_SUB2 -.LSGEMM_L2x2_SAVE: +SGEMM_L2x2_SAVE: SAVE2x2 -.LSGEMM_L2x2_END: +SGEMM_L2x2_END: -.LSGEMM_L2x1_BEGIN: +SGEMM_L2x1_BEGIN: andi. T1, M, 1 - ble .LSGEMM_L2x1_END - mr BO, B + ble SGEMM_L2x1_END + mr BO, BBUFFER srawi. L, K, 3 - ble .LSGEMM_L2x1_SUB0 + ble SGEMM_L2x1_SUB0 cmpwi cr0, L, 1 - ble .LSGEMM_L2x1_SUB4 + ble SGEMM_L2x1_SUB4 -.LSGEMM_L2x1_LOOP_START: +SGEMM_L2x1_LOOP_START: LOAD2x1_1 KERNEL2x1_I1 @@ -1564,11 +1683,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x1_2 addic. L, L, -2 - ble .LSGEMM_L2x1_LOOP_END + ble SGEMM_L2x1_LOOP_END .align 5 -.LSGEMM_L2x1_LOOP: +SGEMM_L2x1_LOOP: KERNEL2x1_1 KERNEL2x1_2 @@ -1581,9 +1700,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x1_2 addic. L, L, -1 - bgt .LSGEMM_L2x1_LOOP + bgt SGEMM_L2x1_LOOP -.LSGEMM_L2x1_LOOP_END: +SGEMM_L2x1_LOOP_END: KERNEL2x1_1 KERNEL2x1_2 @@ -1595,9 +1714,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x1_1 KERNEL2x1_E2 - b .LSGEMM_L2x1_SUB1 + b SGEMM_L2x1_SUB1 -.LSGEMM_L2x1_SUB4: +SGEMM_L2x1_SUB4: KERNEL2x1_SUBI1 KERNEL2x1_SUB1 @@ -1609,59 +1728,91 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x1_SUB1 KERNEL2x1_SUB1 - b .LSGEMM_L2x1_SUB1 + b SGEMM_L2x1_SUB1 -.LSGEMM_L2x1_SUB0: +SGEMM_L2x1_SUB0: andi. L, K, 7 KERNEL2x1_SUBI1 addic. L, L, -1 - ble .LSGEMM_L2x1_SAVE - b .LSGEMM_L2x1_SUB2 + ble SGEMM_L2x1_SAVE + b SGEMM_L2x1_SUB2 -.LSGEMM_L2x1_SUB1: +SGEMM_L2x1_SUB1: andi. L, K, 7 - ble .LSGEMM_L2x1_SAVE + ble SGEMM_L2x1_SAVE -.LSGEMM_L2x1_SUB2: +SGEMM_L2x1_SUB2: KERNEL2x1_SUB1 addic. 
L, L, -1 - bgt .LSGEMM_L2x1_SUB2 + bgt SGEMM_L2x1_SUB2 -.LSGEMM_L2x1_SAVE: +SGEMM_L2x1_SAVE: SAVE2x1 -.LSGEMM_L2x1_END: +SGEMM_L2x1_END: slwi T1, K, 3 add B, B, T1 -.LSGEMM_L2_END: -.LSGEMM_L1_BEGIN: +SGEMM_L2_END: +SGEMM_L1_BEGIN: + + mr BO, B + mr BBO, BBUFFER + slwi T1, K, 0 + +SGEMM_L1_COPYB: + dcbtst BBO, PRE + + lxvw4x vs3, o0, BO + lxvw4x vs11, o16, BO + xxspltw vs4, vs3, 0 + xxspltw vs5, vs3, 1 + xxspltw vs6, vs3, 2 + xxspltw vs7, vs3, 3 + xxspltw vs12, vs11, 0 + xxspltw vs13, vs11, 1 + xxspltw vs14, vs11, 2 + xxspltw vs15, vs11, 3 + stxvw4x vs4, o0, BBO + stxvw4x vs5, o16, BBO + stxvw4x vs6, o32, BBO + stxvw4x vs7, o48, BBO + addi BO, BO, 32 + addi BBO, BBO, 64 + stxvw4x vs12, o0, BBO + stxvw4x vs13, o16, BBO + stxvw4x vs14, o32, BBO + stxvw4x vs15, o48, BBO + addic. T1, T1, -8 + addi BBO, BBO, 64 + + bge SGEMM_L1_COPYB andi. T1, N, 1 - ble .LSGEMM_L1_END + ble SGEMM_L1_END mr CO, C mr AO, A srawi. I, M, 4 - ble .LSGEMM_L1x16_END + ble SGEMM_L1x16_END -.LSGEMM_L1x16_BEGIN: +SGEMM_L1x16_BEGIN: - mr BO, B + mr BO, BBUFFER srawi. L, K, 3 - ble .LSGEMM_L1x16_SUB0 + ble SGEMM_L1x16_SUB0 cmpwi cr0, L, 1 - ble .LSGEMM_L1x16_SUB4 + ble SGEMM_L1x16_SUB4 -.LSGEMM_L1x16_LOOP_START: +SGEMM_L1x16_LOOP_START: dcbt AO, PRE LOAD1x16_1 @@ -1680,11 +1831,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x16_2 addic. L, L, -2 - ble .LSGEMM_L1x16_LOOP_END + ble SGEMM_L1x16_LOOP_END .align 5 -.LSGEMM_L1x16_LOOP: +SGEMM_L1x16_LOOP: KERNEL1x16_1 dcbt AO, PRE @@ -1701,9 +1852,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x16_2 addic. L, L, -1 - bgt .LSGEMM_L1x16_LOOP + bgt SGEMM_L1x16_LOOP -.LSGEMM_L1x16_LOOP_END: +SGEMM_L1x16_LOOP_END: KERNEL1x16_1 dcbt AO, PRE @@ -1718,9 +1869,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
KERNEL1x16_1 KERNEL1x16_E2 - b .LSGEMM_L1x16_SUB1 + b SGEMM_L1x16_SUB1 -.LSGEMM_L1x16_SUB4: +SGEMM_L1x16_SUB4: dcbt AO, PRE KERNEL1x16_SUBI1 @@ -1734,53 +1885,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x16_SUB1 KERNEL1x16_SUB1 - b .LSGEMM_L1x16_SUB1 + b SGEMM_L1x16_SUB1 -.LSGEMM_L1x16_SUB0: +SGEMM_L1x16_SUB0: andi. L, K, 7 KERNEL1x16_SUBI1 addic. L, L, -1 - ble .LSGEMM_L1x16_SAVE - b .LSGEMM_L1x16_SUB2 + ble SGEMM_L1x16_SAVE + b SGEMM_L1x16_SUB2 -.LSGEMM_L1x16_SUB1: +SGEMM_L1x16_SUB1: andi. L, K, 7 - ble .LSGEMM_L1x16_SAVE + ble SGEMM_L1x16_SAVE -.LSGEMM_L1x16_SUB2: +SGEMM_L1x16_SUB2: KERNEL1x16_SUB1 addic. L, L, -1 - bgt .LSGEMM_L1x16_SUB2 + bgt SGEMM_L1x16_SUB2 -.LSGEMM_L1x16_SAVE: +SGEMM_L1x16_SAVE: SAVE1x16 addic. I, I, -1 - bgt .LSGEMM_L1x16_BEGIN + bgt SGEMM_L1x16_BEGIN -.LSGEMM_L1x16_END: +SGEMM_L1x16_END: -.LSGEMM_L1x8_BEGIN: +SGEMM_L1x8_BEGIN: andi. T2, M, 15 - ble .LSGEMM_L1x1_END + ble SGEMM_L1x1_END andi. T1, M, 8 - ble .LSGEMM_L1x8_END - mr BO, B + ble SGEMM_L1x8_END + mr BO, BBUFFER srawi. L, K, 3 - ble .LSGEMM_L1x8_SUB0 + ble SGEMM_L1x8_SUB0 cmpwi cr0, L, 1 - ble .LSGEMM_L1x8_SUB4 + ble SGEMM_L1x8_SUB4 -.LSGEMM_L1x8_LOOP_START: +SGEMM_L1x8_LOOP_START: LOAD1x8_1 KERNEL1x8_I1 @@ -1794,11 +1945,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x8_2 addic. L, L, -2 - ble .LSGEMM_L1x8_LOOP_END + ble SGEMM_L1x8_LOOP_END .align 5 -.LSGEMM_L1x8_LOOP: +SGEMM_L1x8_LOOP: KERNEL1x8_1 KERNEL1x8_2 @@ -1811,9 +1962,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x8_2 addic. L, L, -1 - bgt .LSGEMM_L1x8_LOOP + bgt SGEMM_L1x8_LOOP -.LSGEMM_L1x8_LOOP_END: +SGEMM_L1x8_LOOP_END: KERNEL1x8_1 KERNEL1x8_2 @@ -1825,9 +1976,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
KERNEL1x8_1 KERNEL1x8_E2 - b .LSGEMM_L1x8_SUB1 + b SGEMM_L1x8_SUB1 -.LSGEMM_L1x8_SUB4: +SGEMM_L1x8_SUB4: KERNEL1x8_SUBI1 KERNEL1x8_SUB1 @@ -1839,48 +1990,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x8_SUB1 KERNEL1x8_SUB1 - b .LSGEMM_L1x8_SUB1 + b SGEMM_L1x8_SUB1 -.LSGEMM_L1x8_SUB0: +SGEMM_L1x8_SUB0: andi. L, K, 7 KERNEL1x8_SUBI1 addic. L, L, -1 - ble .LSGEMM_L1x8_SAVE - b .LSGEMM_L1x8_SUB2 + ble SGEMM_L1x8_SAVE + b SGEMM_L1x8_SUB2 -.LSGEMM_L1x8_SUB1: +SGEMM_L1x8_SUB1: andi. L, K, 7 - ble .LSGEMM_L1x8_SAVE + ble SGEMM_L1x8_SAVE -.LSGEMM_L1x8_SUB2: +SGEMM_L1x8_SUB2: KERNEL1x8_SUB1 addic. L, L, -1 - bgt .LSGEMM_L1x8_SUB2 + bgt SGEMM_L1x8_SUB2 -.LSGEMM_L1x8_SAVE: +SGEMM_L1x8_SAVE: SAVE1x8 -.LSGEMM_L1x8_END: +SGEMM_L1x8_END: -.LSGEMM_L1x4_BEGIN: +SGEMM_L1x4_BEGIN: andi. T1, M, 4 - ble .LSGEMM_L1x4_END - mr BO, B + ble SGEMM_L1x4_END + mr BO, BBUFFER srawi. L, K, 3 - ble .LSGEMM_L1x4_SUB0 + ble SGEMM_L1x4_SUB0 cmpwi cr0, L, 1 - ble .LSGEMM_L1x4_SUB4 + ble SGEMM_L1x4_SUB4 -.LSGEMM_L1x4_LOOP_START: +SGEMM_L1x4_LOOP_START: LOAD1x4_1 KERNEL1x4_I1 @@ -1894,11 +2045,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x4_2 addic. L, L, -2 - ble .LSGEMM_L1x4_LOOP_END + ble SGEMM_L1x4_LOOP_END .align 5 -.LSGEMM_L1x4_LOOP: +SGEMM_L1x4_LOOP: KERNEL1x4_1 KERNEL1x4_2 @@ -1911,9 +2062,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x4_2 addic. L, L, -1 - bgt .LSGEMM_L1x4_LOOP + bgt SGEMM_L1x4_LOOP -.LSGEMM_L1x4_LOOP_END: +SGEMM_L1x4_LOOP_END: KERNEL1x4_1 KERNEL1x4_2 @@ -1925,9 +2076,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x4_1 KERNEL1x4_E2 - b .LSGEMM_L1x4_SUB1 + b SGEMM_L1x4_SUB1 -.LSGEMM_L1x4_SUB4: +SGEMM_L1x4_SUB4: KERNEL1x4_SUBI1 KERNEL1x4_SUB1 @@ -1939,48 +2090,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
KERNEL1x4_SUB1 KERNEL1x4_SUB1 - b .LSGEMM_L1x4_SUB1 + b SGEMM_L1x4_SUB1 -.LSGEMM_L1x4_SUB0: +SGEMM_L1x4_SUB0: andi. L, K, 7 KERNEL1x4_SUBI1 addic. L, L, -1 - ble .LSGEMM_L1x4_SAVE - b .LSGEMM_L1x4_SUB2 + ble SGEMM_L1x4_SAVE + b SGEMM_L1x4_SUB2 -.LSGEMM_L1x4_SUB1: +SGEMM_L1x4_SUB1: andi. L, K, 7 - ble .LSGEMM_L1x4_SAVE + ble SGEMM_L1x4_SAVE -.LSGEMM_L1x4_SUB2: +SGEMM_L1x4_SUB2: KERNEL1x4_SUB1 addic. L, L, -1 - bgt .LSGEMM_L1x4_SUB2 + bgt SGEMM_L1x4_SUB2 -.LSGEMM_L1x4_SAVE: +SGEMM_L1x4_SAVE: SAVE1x4 -.LSGEMM_L1x4_END: +SGEMM_L1x4_END: -.LSGEMM_L1x2_BEGIN: +SGEMM_L1x2_BEGIN: andi. T1, M, 2 - ble .LSGEMM_L1x2_END - mr BO, B + ble SGEMM_L1x2_END + mr BO, BBUFFER srawi. L, K, 3 - ble .LSGEMM_L1x2_SUB0 + ble SGEMM_L1x2_SUB0 cmpwi cr0, L, 1 - ble .LSGEMM_L1x2_SUB4 + ble SGEMM_L1x2_SUB4 -.LSGEMM_L1x2_LOOP_START: +SGEMM_L1x2_LOOP_START: LOAD1x2_1 KERNEL1x2_I1 @@ -1994,11 +2145,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x2_2 addic. L, L, -2 - ble .LSGEMM_L1x2_LOOP_END + ble SGEMM_L1x2_LOOP_END .align 5 -.LSGEMM_L1x2_LOOP: +SGEMM_L1x2_LOOP: KERNEL1x2_1 KERNEL1x2_2 @@ -2011,9 +2162,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x2_2 addic. L, L, -1 - bgt .LSGEMM_L1x2_LOOP + bgt SGEMM_L1x2_LOOP -.LSGEMM_L1x2_LOOP_END: +SGEMM_L1x2_LOOP_END: KERNEL1x2_1 KERNEL1x2_2 @@ -2025,9 +2176,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x2_1 KERNEL1x2_E2 - b .LSGEMM_L1x2_SUB1 + b SGEMM_L1x2_SUB1 -.LSGEMM_L1x2_SUB4: +SGEMM_L1x2_SUB4: KERNEL1x2_SUBI1 KERNEL1x2_SUB1 @@ -2039,48 +2190,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x2_SUB1 KERNEL1x2_SUB1 - b .LSGEMM_L1x2_SUB1 + b SGEMM_L1x2_SUB1 -.LSGEMM_L1x2_SUB0: +SGEMM_L1x2_SUB0: andi. L, K, 7 KERNEL1x2_SUBI1 addic. L, L, -1 - ble .LSGEMM_L1x2_SAVE - b .LSGEMM_L1x2_SUB2 + ble SGEMM_L1x2_SAVE + b SGEMM_L1x2_SUB2 -.LSGEMM_L1x2_SUB1: +SGEMM_L1x2_SUB1: andi. 
L, K, 7 - ble .LSGEMM_L1x2_SAVE + ble SGEMM_L1x2_SAVE -.LSGEMM_L1x2_SUB2: +SGEMM_L1x2_SUB2: KERNEL1x2_SUB1 addic. L, L, -1 - bgt .LSGEMM_L1x2_SUB2 + bgt SGEMM_L1x2_SUB2 -.LSGEMM_L1x2_SAVE: +SGEMM_L1x2_SAVE: SAVE1x2 -.LSGEMM_L1x2_END: +SGEMM_L1x2_END: -.LSGEMM_L1x1_BEGIN: +SGEMM_L1x1_BEGIN: andi. T1, M, 1 - ble .LSGEMM_L1x1_END - mr BO, B + ble SGEMM_L1x1_END + mr BO, BBUFFER srawi. L, K, 3 - ble .LSGEMM_L1x1_SUB0 + ble SGEMM_L1x1_SUB0 cmpwi cr0, L, 1 - ble .LSGEMM_L1x1_SUB4 + ble SGEMM_L1x1_SUB4 -.LSGEMM_L1x1_LOOP_START: +SGEMM_L1x1_LOOP_START: LOAD1x1_1 KERNEL1x1_I1 @@ -2094,11 +2245,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x1_2 addic. L, L, -2 - ble .LSGEMM_L1x1_LOOP_END + ble SGEMM_L1x1_LOOP_END .align 5 -.LSGEMM_L1x1_LOOP: +SGEMM_L1x1_LOOP: KERNEL1x1_1 KERNEL1x1_2 @@ -2111,9 +2262,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x1_2 addic. L, L, -1 - bgt .LSGEMM_L1x1_LOOP + bgt SGEMM_L1x1_LOOP -.LSGEMM_L1x1_LOOP_END: +SGEMM_L1x1_LOOP_END: KERNEL1x1_1 KERNEL1x1_2 @@ -2125,9 +2276,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x1_1 KERNEL1x1_E2 - b .LSGEMM_L1x1_SUB1 + b SGEMM_L1x1_SUB1 -.LSGEMM_L1x1_SUB4: +SGEMM_L1x1_SUB4: KERNEL1x1_SUBI1 KERNEL1x1_SUB1 @@ -2139,34 +2290,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x1_SUB1 KERNEL1x1_SUB1 - b .LSGEMM_L1x1_SUB1 + b SGEMM_L1x1_SUB1 -.LSGEMM_L1x1_SUB0: +SGEMM_L1x1_SUB0: andi. L, K, 7 KERNEL1x1_SUBI1 addic. L, L, -1 - ble .LSGEMM_L1x1_SAVE - b .LSGEMM_L1x1_SUB2 + ble SGEMM_L1x1_SAVE + b SGEMM_L1x1_SUB2 -.LSGEMM_L1x1_SUB1: +SGEMM_L1x1_SUB1: andi. L, K, 7 - ble .LSGEMM_L1x1_SAVE + ble SGEMM_L1x1_SAVE -.LSGEMM_L1x1_SUB2: +SGEMM_L1x1_SUB2: KERNEL1x1_SUB1 addic. 
L, L, -1 - bgt .LSGEMM_L1x1_SUB2 + bgt SGEMM_L1x1_SUB2 -.LSGEMM_L1x1_SAVE: +SGEMM_L1x1_SAVE: SAVE1x1 -.LSGEMM_L1x1_END: +SGEMM_L1x1_END: -.LSGEMM_L1_END: +SGEMM_L1_END: diff --git a/kernel/power/sgemm_macros_16x8_power8.S b/kernel/power/sgemm_macros_16x8_power8.S index a2d36c089..71dc52979 100644 --- a/kernel/power/sgemm_macros_16x8_power8.S +++ b/kernel/power/sgemm_macros_16x8_power8.S @@ -26,13 +26,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** -* 2016/03/18 Werner Saar (wernsaar@googlemail.com) +* 2016/04/02 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK -* LAPACK-TEST : OK +* LAPACK-TEST : OK **************************************************************************************/ + /********************************************************************************************** * Macros for N=8 and M=16 **********************************************************************************************/ @@ -46,21 +47,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs8, vs28, 0 - xxspltw vs9, vs28, 1 - xxspltw vs10, vs28, 2 - xxspltw vs11, vs28, 3 + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + lxvw4x vs10, o32, T1 + lxvw4x vs11, o48, T1 - lxvw4x vs29, o16, BO + addi T1, T1, 64 - xxspltw vs12, vs29, 0 - xxspltw vs13, vs29, 1 - xxspltw vs14, vs29, 2 - xxspltw vs15, vs29, 3 + lxvw4x vs12, o0, T1 + lxvw4x vs13, o16, T1 + lxvw4x vs14, o32, T1 + lxvw4x vs15, o48, T1 - addi BO, BO, 32 + addi BO, BO, 128 .endm @@ -74,21 +75,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi AO, AO, 64 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs16, vs28, 0 - xxspltw vs17, vs28, 1 - xxspltw vs18, vs28, 2 - xxspltw vs19, vs28, 3 + lxvw4x vs16, o0, T1 + lxvw4x vs17, o16, T1 + lxvw4x vs18, o32, T1 + lxvw4x vs19, o48, T1 - lxvw4x vs29, o16, BO + addi T1, T1, 64 - xxspltw vs20, vs29, 0 - xxspltw vs21, vs29, 1 - xxspltw vs22, vs29, 2 - xxspltw vs23, vs29, 3 + lxvw4x vs20, o0, T1 + lxvw4x vs21, o16, T1 + lxvw4x vs22, o32, T1 + lxvw4x vs23, o48, T1 - addi BO, BO, 32 + addi BO, BO, 128 xvmulsp vs32, vs0, vs8 @@ -136,42 +137,51 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL8x16_1 + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + lxvw4x vs6, o32, AO + lxvw4x vs7, o48, AO + + addi AO, AO, 64 + + mr T1, BO + + lxvw4x vs16, o0, T1 + lxvw4x vs17, o16, T1 + lxvw4x vs18, o32, T1 + lxvw4x vs19, o48, T1 + + addi T1, T1, 64 + + lxvw4x vs20, o0, T1 + lxvw4x vs21, o16, T1 + lxvw4x vs22, o32, T1 + lxvw4x vs23, o48, T1 + + addi BO, BO, 128 + + xvmaddasp vs32, vs0, vs8 xvmaddasp vs33, vs1, vs8 - lxvw4x vs28, o0, BO - lxvw4x vs4, o0, AO xvmaddasp vs34, vs2, vs8 xvmaddasp vs35, vs3, vs8 xvmaddasp vs36, vs0, vs9 xvmaddasp vs37, vs1, vs9 - lxvw4x vs29, o16, BO - lxvw4x vs5, o16, AO xvmaddasp vs38, vs2, vs9 xvmaddasp vs39, vs3, vs9 xvmaddasp vs40, vs0, vs10 xvmaddasp vs41, vs1, vs10 - lxvw4x vs6, o32, AO - lxvw4x vs7, o48, AO xvmaddasp vs42, vs2, vs10 xvmaddasp vs43, vs3, vs10 - xxspltw vs16, vs28, 0 - xxspltw vs17, vs28, 1 - xxspltw vs18, vs28, 2 - xxspltw vs19, vs28, 3 - xvmaddasp vs44, vs0, vs11 xvmaddasp vs45, vs1, vs11 xvmaddasp vs46, vs2, vs11 xvmaddasp vs47, vs3, vs11 - xxspltw vs20, vs29, 0 - xxspltw vs21, vs29, 1 - xxspltw vs22, vs29, 2 - xxspltw vs23, vs29, 3 - xvmaddasp vs48, vs0, vs12 xvmaddasp vs49, vs1, vs12 xvmaddasp vs50, vs2, vs12 @@ -184,8 +194,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmaddasp vs56, vs0, vs14 xvmaddasp vs57, vs1, vs14 - addi AO, AO, 64 - addi BO, BO, 32 xvmaddasp vs58, vs2, vs14 xvmaddasp vs59, vs3, vs14 @@ -199,47 +207,51 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL8x16_2 - xvmaddasp vs32, vs4, vs16 - xvmaddasp vs33, vs5, vs16 - lxvw4x vs28, o0, BO lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + mr T1, BO + + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + lxvw4x vs10, o32, T1 + lxvw4x vs11, o48, T1 + + addi T1, T1, 64 + + lxvw4x vs12, o0, T1 + lxvw4x vs13, o16, T1 + lxvw4x vs14, o32, T1 + lxvw4x vs15, o48, T1 + addi BO, BO, 128 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 xvmaddasp vs34, vs6, vs16 xvmaddasp vs35, vs7, vs16 xvmaddasp vs36, vs4, vs17 xvmaddasp vs37, vs5, vs17 - - lxvw4x vs29, o16, BO - lxvw4x vs1, o16, AO - xvmaddasp vs38, vs6, vs17 xvmaddasp vs39, vs7, vs17 - lxvw4x vs2, o32, AO - lxvw4x vs3, o48, AO - xvmaddasp vs40, vs4, vs18 xvmaddasp vs41, vs5, vs18 xvmaddasp vs42, vs6, vs18 xvmaddasp vs43, vs7, vs18 - xxspltw vs8, vs28, 0 - xxspltw vs9, vs28, 1 - xxspltw vs10, vs28, 2 - xxspltw vs11, vs28, 3 - xvmaddasp vs44, vs4, vs19 xvmaddasp vs45, vs5, vs19 xvmaddasp vs46, vs6, vs19 xvmaddasp vs47, vs7, vs19 - xxspltw vs12, vs29, 0 - xxspltw vs13, vs29, 1 - xxspltw vs14, vs29, 2 - xxspltw vs15, vs29, 3 - xvmaddasp vs48, vs4, vs20 xvmaddasp vs49, vs5, vs20 xvmaddasp vs50, vs6, vs20 @@ -257,8 +269,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs60, vs4, vs23 xvmaddasp vs61, vs5, vs23 - addi AO, AO, 64 - addi BO, BO, 32 xvmaddasp vs62, vs6, vs23 xvmaddasp vs63, vs7, vs23 @@ -321,21 +331,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi AO, AO, 64 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs8, vs28, 0 - xxspltw vs9, vs28, 1 - xxspltw vs10, vs28, 2 - xxspltw vs11, vs28, 3 + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + lxvw4x vs10, o32, T1 + lxvw4x vs11, o48, T1 - lxvw4x vs29, o16, BO + addi T1, T1, 64 - xxspltw vs12, vs29, 0 - xxspltw vs13, vs29, 1 - xxspltw vs14, vs29, 2 - xxspltw vs15, vs29, 3 + lxvw4x vs12, o0, T1 + lxvw4x vs13, o16, T1 + lxvw4x vs14, o32, T1 + lxvw4x vs15, o48, T1 - addi BO, BO, 32 + addi BO, BO, 128 xvmulsp vs32, vs0, vs8 @@ -391,21 +401,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs8, vs28, 0 - xxspltw vs9, vs28, 1 - xxspltw vs10, vs28, 2 - xxspltw vs11, vs28, 3 + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + lxvw4x vs10, o32, T1 + lxvw4x vs11, o48, T1 - lxvw4x vs29, o16, BO + addi T1, T1, 64 - xxspltw vs12, vs29, 0 - xxspltw vs13, vs29, 1 - xxspltw vs14, vs29, 2 - xxspltw vs15, vs29, 3 + lxvw4x vs12, o0, T1 + lxvw4x vs13, o16, T1 + lxvw4x vs14, o32, T1 + lxvw4x vs15, o48, T1 - addi BO, BO, 32 + addi BO, BO, 128 xvmaddasp vs32, vs0, vs8 @@ -464,106 +474,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif - - stxvw4x vs32, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - #ifdef TRMMKERNEL - lxvw4x vs0, o0, TBUFFER + xvmulsp vs0, vs32, alpha_vr + xvmulsp vs1, vs33, alpha_vr + xvmulsp vs2, vs34, alpha_vr + xvmulsp vs3, vs35, alpha_vr #else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs0, vs0, vs28 + xvmaddasp vs0, vs32, alpha_vr + xvmaddasp vs1, vs33, alpha_vr + xvmaddasp vs2, vs34, alpha_vr + xvmaddasp vs3, vs35, alpha_vr #endif - stxvw4x vs33, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs1, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs1, vs1, vs28 -#endif - - stxvw4x vs34, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs2, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs2, vs2, vs28 -#endif - - stxvw4x vs35, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, 
o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs3, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs3, vs3, vs28 -#endif - - - - stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 stxvw4x vs2, o32, T1 @@ -581,106 +503,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif - - stxvw4x vs36, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - #ifdef TRMMKERNEL - lxvw4x vs0, o0, TBUFFER + xvmulsp vs0, vs36, alpha_vr + xvmulsp vs1, vs37, alpha_vr + xvmulsp vs2, vs38, alpha_vr + xvmulsp vs3, vs39, alpha_vr #else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs0, vs0, vs28 + xvmaddasp vs0, vs36, alpha_vr + xvmaddasp vs1, vs37, alpha_vr + xvmaddasp vs2, vs38, alpha_vr + xvmaddasp vs3, vs39, alpha_vr #endif - stxvw4x vs37, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs1, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs1, vs1, vs28 -#endif - - stxvw4x vs38, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs2, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs2, vs2, vs28 -#endif - 
- stxvw4x vs39, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs3, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs3, vs3, vs28 -#endif - - - - stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 stxvw4x vs2, o32, T1 @@ -698,106 +532,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif - - stxvw4x vs40, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs0, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs0, vs0, vs28 -#endif - - stxvw4x vs41, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs1, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs1, vs1, vs28 -#endif - - stxvw4x vs42, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs2, o0, 
TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs2, vs2, vs28 -#endif - - stxvw4x vs43, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - #ifdef TRMMKERNEL - lxvw4x vs3, o0, TBUFFER + xvmulsp vs0, vs40, alpha_vr + xvmulsp vs1, vs41, alpha_vr + xvmulsp vs2, vs42, alpha_vr + xvmulsp vs3, vs43, alpha_vr #else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs3, vs3, vs28 + xvmaddasp vs0, vs40, alpha_vr + xvmaddasp vs1, vs41, alpha_vr + xvmaddasp vs2, vs42, alpha_vr + xvmaddasp vs3, vs43, alpha_vr #endif - - - stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 stxvw4x vs2, o32, T1 @@ -815,106 +561,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif - - stxvw4x vs44, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs0, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs0, vs0, vs28 -#endif - - stxvw4x vs45, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs1, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs1, vs1, vs28 -#endif - - stxvw4x vs46, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, 
TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs2, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs2, vs2, vs28 -#endif - - stxvw4x vs47, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - #ifdef TRMMKERNEL - lxvw4x vs3, o0, TBUFFER + xvmulsp vs0, vs44, alpha_vr + xvmulsp vs1, vs45, alpha_vr + xvmulsp vs2, vs46, alpha_vr + xvmulsp vs3, vs47, alpha_vr #else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs3, vs3, vs28 + xvmaddasp vs0, vs44, alpha_vr + xvmaddasp vs1, vs45, alpha_vr + xvmaddasp vs2, vs46, alpha_vr + xvmaddasp vs3, vs47, alpha_vr #endif - - - stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 stxvw4x vs2, o32, T1 @@ -932,106 +590,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif - - stxvw4x vs48, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - #ifdef TRMMKERNEL - lxvw4x vs0, o0, TBUFFER + xvmulsp vs0, vs48, alpha_vr + xvmulsp vs1, vs49, alpha_vr + xvmulsp vs2, vs50, alpha_vr + xvmulsp vs3, vs51, alpha_vr #else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs0, vs0, vs28 + xvmaddasp vs0, vs48, alpha_vr + xvmaddasp vs1, vs49, alpha_vr + xvmaddasp vs2, vs50, alpha_vr + xvmaddasp vs3, vs51, alpha_vr #endif - stxvw4x vs49, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs1, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs1, vs1, vs28 -#endif - - stxvw4x vs50, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs2, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs2, vs2, vs28 -#endif - - stxvw4x vs51, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, 
o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs3, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs3, vs3, vs28 -#endif - - - - stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 stxvw4x vs2, o32, T1 @@ -1049,106 +619,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif - - stxvw4x vs52, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - #ifdef TRMMKERNEL - lxvw4x vs0, o0, TBUFFER + xvmulsp vs0, vs52, alpha_vr + xvmulsp vs1, vs53, alpha_vr + xvmulsp vs2, vs54, alpha_vr + xvmulsp vs3, vs55, alpha_vr #else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs0, vs0, vs28 + xvmaddasp vs0, vs52, alpha_vr + xvmaddasp vs1, vs53, alpha_vr + xvmaddasp vs2, vs54, alpha_vr + xvmaddasp vs3, vs55, alpha_vr #endif - stxvw4x vs53, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs1, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs1, vs1, vs28 -#endif - - stxvw4x vs54, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs2, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs2, vs2, vs28 -#endif 
- - stxvw4x vs55, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs3, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs3, vs3, vs28 -#endif - - - - stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 stxvw4x vs2, o32, T1 @@ -1166,106 +648,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif - - stxvw4x vs56, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs0, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs0, vs0, vs28 -#endif - - stxvw4x vs57, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs1, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs1, vs1, vs28 -#endif - - stxvw4x vs58, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs2, 
o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs2, vs2, vs28 -#endif - - stxvw4x vs59, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - #ifdef TRMMKERNEL - lxvw4x vs3, o0, TBUFFER + xvmulsp vs0, vs56, alpha_vr + xvmulsp vs1, vs57, alpha_vr + xvmulsp vs2, vs58, alpha_vr + xvmulsp vs3, vs59, alpha_vr #else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs3, vs3, vs28 + xvmaddasp vs0, vs56, alpha_vr + xvmaddasp vs1, vs57, alpha_vr + xvmaddasp vs2, vs58, alpha_vr + xvmaddasp vs3, vs59, alpha_vr #endif - - - stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 stxvw4x vs2, o32, T1 @@ -1283,106 +677,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif - - stxvw4x vs60, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs0, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs0, vs0, vs28 -#endif - - stxvw4x vs61, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs1, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs1, vs1, vs28 -#endif - - stxvw4x vs62, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, 
TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs2, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs2, vs2, vs28 -#endif - - stxvw4x vs63, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - #ifdef TRMMKERNEL - lxvw4x vs3, o0, TBUFFER + xvmulsp vs0, vs60, alpha_vr + xvmulsp vs1, vs61, alpha_vr + xvmulsp vs2, vs62, alpha_vr + xvmulsp vs3, vs63, alpha_vr #else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs3, vs3, vs28 + xvmaddasp vs0, vs60, alpha_vr + xvmaddasp vs1, vs61, alpha_vr + xvmaddasp vs2, vs62, alpha_vr + xvmaddasp vs3, vs63, alpha_vr #endif - - - stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 stxvw4x vs2, o32, T1 @@ -1406,21 +712,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 32 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs8, vs28, 0 - xxspltw vs9, vs28, 1 - xxspltw vs10, vs28, 2 - xxspltw vs11, vs28, 3 + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + lxvw4x vs10, o32, T1 + lxvw4x vs11, o48, T1 - lxvw4x vs29, o16, BO + addi T1, T1, 64 - xxspltw vs12, vs29, 0 - xxspltw vs13, vs29, 1 - xxspltw vs14, vs29, 2 - xxspltw vs15, vs29, 3 + lxvw4x vs12, o0, T1 + lxvw4x vs13, o16, T1 + lxvw4x vs14, o32, T1 + lxvw4x vs15, o48, T1 - addi BO, BO, 32 + addi BO, BO, 128 .endm @@ -1432,21 +738,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi AO, AO, 32 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs16, vs28, 0 - xxspltw vs17, vs28, 1 - xxspltw vs18, vs28, 2 - xxspltw vs19, vs28, 3 + lxvw4x vs16, o0, T1 + lxvw4x vs17, o16, T1 + lxvw4x vs18, o32, T1 + lxvw4x vs19, o48, T1 - lxvw4x vs29, o16, BO + addi T1, T1, 64 - xxspltw vs20, vs29, 0 - xxspltw vs21, vs29, 1 - xxspltw vs22, vs29, 2 - xxspltw vs23, vs29, 3 + lxvw4x vs20, o0, T1 + lxvw4x vs21, o16, T1 + lxvw4x vs22, o32, T1 + lxvw4x vs23, o48, T1 - addi BO, BO, 32 + addi BO, BO, 128 xvmulsp vs32, vs0, vs8 @@ -1484,21 +790,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 32 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs16, vs28, 0 - xxspltw vs17, vs28, 1 - xxspltw vs18, vs28, 2 - xxspltw vs19, vs28, 3 + lxvw4x vs16, o0, T1 + lxvw4x vs17, o16, T1 + lxvw4x vs18, o32, T1 + lxvw4x vs19, o48, T1 - lxvw4x vs29, o16, BO + addi T1, T1, 64 - xxspltw vs20, vs29, 0 - xxspltw vs21, vs29, 1 - xxspltw vs22, vs29, 2 - xxspltw vs23, vs29, 3 + lxvw4x vs20, o0, T1 + lxvw4x vs21, o16, T1 + lxvw4x vs22, o32, T1 + lxvw4x vs23, o48, T1 - addi BO, BO, 32 + addi BO, BO, 128 xvmaddasp vs32, vs0, vs8 @@ -1536,21 +842,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 32 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs8, vs28, 0 - xxspltw vs9, vs28, 1 - xxspltw vs10, vs28, 2 - xxspltw vs11, vs28, 3 + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + lxvw4x vs10, o32, T1 + lxvw4x vs11, o48, T1 - lxvw4x vs29, o16, BO + addi T1, T1, 64 - xxspltw vs12, vs29, 0 - xxspltw vs13, vs29, 1 - xxspltw vs14, vs29, 2 - xxspltw vs15, vs29, 3 + lxvw4x vs12, o0, T1 + lxvw4x vs13, o16, T1 + lxvw4x vs14, o32, T1 + lxvw4x vs15, o48, T1 - addi BO, BO, 32 + addi BO, BO, 128 xvmaddasp vs32, vs4, vs16 @@ -1618,21 +924,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi AO, AO, 32 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs8, vs28, 0 - xxspltw vs9, vs28, 1 - xxspltw vs10, vs28, 2 - xxspltw vs11, vs28, 3 + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + lxvw4x vs10, o32, T1 + lxvw4x vs11, o48, T1 - lxvw4x vs29, o16, BO + addi T1, T1, 64 - xxspltw vs12, vs29, 0 - xxspltw vs13, vs29, 1 - xxspltw vs14, vs29, 2 - xxspltw vs15, vs29, 3 + lxvw4x vs12, o0, T1 + lxvw4x vs13, o16, T1 + lxvw4x vs14, o32, T1 + lxvw4x vs15, o48, T1 - addi BO, BO, 32 + addi BO, BO, 128 xvmulsp vs32, vs0, vs8 @@ -1670,21 +976,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 32 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs8, vs28, 0 - xxspltw vs9, vs28, 1 - xxspltw vs10, vs28, 2 - xxspltw vs11, vs28, 3 + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + lxvw4x vs10, o32, T1 + lxvw4x vs11, o48, T1 - lxvw4x vs29, o16, BO + addi T1, T1, 64 - xxspltw vs12, vs29, 0 - xxspltw vs13, vs29, 1 - xxspltw vs14, vs29, 2 - xxspltw vs15, vs29, 3 + lxvw4x vs12, o0, T1 + lxvw4x vs13, o16, T1 + lxvw4x vs14, o32, T1 + lxvw4x vs15, o48, T1 - addi BO, BO, 32 + addi BO, BO, 128 xvmaddasp vs32, vs0, vs8 @@ -1725,58 +1031,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif - - stxvw4x vs32, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs0, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs0, vs0, vs28 -#endif - - stxvw4x vs33, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - #ifdef TRMMKERNEL - lxvw4x vs1, o0, TBUFFER + xvmulsp vs0, vs32, alpha_vr + xvmulsp vs1, vs33, alpha_vr #else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs1, vs1, vs28 + xvmaddasp vs0, vs32, alpha_vr + xvmaddasp vs1, vs33, alpha_vr #endif - - - stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 @@ -1790,58 +1052,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif - - stxvw4x vs34, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - #ifdef TRMMKERNEL - lxvw4x vs0, o0, TBUFFER + xvmulsp vs0, vs34, alpha_vr + xvmulsp vs1, vs35, alpha_vr #else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs0, vs0, vs28 + xvmaddasp vs0, vs34, alpha_vr + xvmaddasp vs1, vs35, alpha_vr #endif - stxvw4x vs35, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs1, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs1, vs1, vs28 -#endif - - - - stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 @@ -1855,58 +1073,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif - - stxvw4x vs36, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - #ifdef TRMMKERNEL - lxvw4x vs0, o0, TBUFFER + xvmulsp vs0, vs36, alpha_vr + xvmulsp vs1, vs37, alpha_vr #else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs0, vs0, vs28 + xvmaddasp vs0, vs36, alpha_vr + xvmaddasp vs1, vs37, alpha_vr #endif - stxvw4x vs37, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs1, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs1, vs1, vs28 -#endif - - - - stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 @@ -1920,58 +1094,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif - - stxvw4x vs38, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs0, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs0, vs0, vs28 -#endif - - stxvw4x vs39, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - #ifdef TRMMKERNEL - lxvw4x vs1, o0, TBUFFER + xvmulsp vs0, vs38, alpha_vr + xvmulsp vs1, vs39, alpha_vr #else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs1, vs1, vs28 + xvmaddasp vs0, vs38, alpha_vr + xvmaddasp vs1, vs39, alpha_vr #endif - - - stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 @@ -1985,58 +1115,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif - - stxvw4x vs40, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs0, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs0, vs0, vs28 -#endif - - stxvw4x vs41, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - #ifdef TRMMKERNEL - lxvw4x vs1, o0, TBUFFER + xvmulsp vs0, vs40, alpha_vr + xvmulsp vs1, vs41, alpha_vr #else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs1, vs1, vs28 + xvmaddasp vs0, vs40, alpha_vr + xvmaddasp vs1, vs41, alpha_vr #endif - - - stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 @@ -2050,58 +1136,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif - - stxvw4x vs42, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs0, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs0, vs0, vs28 -#endif - - stxvw4x vs43, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - #ifdef TRMMKERNEL - lxvw4x vs1, o0, TBUFFER + xvmulsp vs0, vs42, alpha_vr + xvmulsp vs1, vs43, alpha_vr #else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs1, vs1, vs28 + xvmaddasp vs0, vs42, alpha_vr + xvmaddasp vs1, vs43, alpha_vr #endif - - - stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 @@ -2115,58 +1157,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif - - stxvw4x vs44, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs0, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs0, vs0, vs28 -#endif - - stxvw4x vs45, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - #ifdef TRMMKERNEL - lxvw4x vs1, o0, TBUFFER + xvmulsp vs0, vs44, alpha_vr + xvmulsp vs1, vs45, alpha_vr #else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs1, vs1, vs28 + xvmaddasp vs0, vs44, alpha_vr + xvmaddasp vs1, vs45, alpha_vr #endif - - - stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 @@ -2180,58 +1178,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif - - stxvw4x vs46, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - #ifdef TRMMKERNEL - lxvw4x vs0, o0, TBUFFER + xvmulsp vs0, vs46, alpha_vr + xvmulsp vs1, vs47, alpha_vr #else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs0, vs0, vs28 + xvmaddasp vs0, vs46, alpha_vr + xvmaddasp vs1, vs47, alpha_vr #endif - stxvw4x vs47, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs1, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs1, vs1, vs28 -#endif - - - - stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 @@ -2252,21 +1206,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 16 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs8, vs28, 0 - xxspltw vs9, vs28, 1 - xxspltw vs10, vs28, 2 - xxspltw vs11, vs28, 3 + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + lxvw4x vs10, o32, T1 + lxvw4x vs11, o48, T1 - lxvw4x vs29, o16, BO + addi T1, T1, 64 - xxspltw vs12, vs29, 0 - xxspltw vs13, vs29, 1 - xxspltw vs14, vs29, 2 - xxspltw vs15, vs29, 3 + lxvw4x vs12, o0, T1 + lxvw4x vs13, o16, T1 + lxvw4x vs14, o32, T1 + lxvw4x vs15, o48, T1 - addi BO, BO, 32 + addi BO, BO, 128 .endm @@ -2277,21 +1231,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi AO, AO, 16 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs16, vs28, 0 - xxspltw vs17, vs28, 1 - xxspltw vs18, vs28, 2 - xxspltw vs19, vs28, 3 + lxvw4x vs16, o0, T1 + lxvw4x vs17, o16, T1 + lxvw4x vs18, o32, T1 + lxvw4x vs19, o48, T1 - lxvw4x vs29, o16, BO + addi T1, T1, 64 - xxspltw vs20, vs29, 0 - xxspltw vs21, vs29, 1 - xxspltw vs22, vs29, 2 - xxspltw vs23, vs29, 3 + lxvw4x vs20, o0, T1 + lxvw4x vs21, o16, T1 + lxvw4x vs22, o32, T1 + lxvw4x vs23, o48, T1 - addi BO, BO, 32 + addi BO, BO, 128 xvmulsp vs32, vs0, vs8 @@ -2320,21 +1274,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 16 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs16, vs28, 0 - xxspltw vs17, vs28, 1 - xxspltw vs18, vs28, 2 - xxspltw vs19, vs28, 3 + lxvw4x vs16, o0, T1 + lxvw4x vs17, o16, T1 + lxvw4x vs18, o32, T1 + lxvw4x vs19, o48, T1 - lxvw4x vs29, o16, BO + addi T1, T1, 64 - xxspltw vs20, vs29, 0 - xxspltw vs21, vs29, 1 - xxspltw vs22, vs29, 2 - xxspltw vs23, vs29, 3 + lxvw4x vs20, o0, T1 + lxvw4x vs21, o16, T1 + lxvw4x vs22, o32, T1 + lxvw4x vs23, o48, T1 - addi BO, BO, 32 + addi BO, BO, 128 xvmaddasp vs32, vs0, vs8 @@ -2363,21 +1317,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 16 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs8, vs28, 0 - xxspltw vs9, vs28, 1 - xxspltw vs10, vs28, 2 - xxspltw vs11, vs28, 3 + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + lxvw4x vs10, o32, T1 + lxvw4x vs11, o48, T1 - lxvw4x vs29, o16, BO + addi T1, T1, 64 - xxspltw vs12, vs29, 0 - xxspltw vs13, vs29, 1 - xxspltw vs14, vs29, 2 - xxspltw vs15, vs29, 3 + lxvw4x vs12, o0, T1 + lxvw4x vs13, o16, T1 + lxvw4x vs14, o32, T1 + lxvw4x vs15, o48, T1 - addi BO, BO, 32 + addi BO, BO, 128 xvmaddasp vs32, vs4, vs16 @@ -2428,21 +1382,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi AO, AO, 16 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs8, vs28, 0 - xxspltw vs9, vs28, 1 - xxspltw vs10, vs28, 2 - xxspltw vs11, vs28, 3 + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + lxvw4x vs10, o32, T1 + lxvw4x vs11, o48, T1 - lxvw4x vs29, o16, BO + addi T1, T1, 64 - xxspltw vs12, vs29, 0 - xxspltw vs13, vs29, 1 - xxspltw vs14, vs29, 2 - xxspltw vs15, vs29, 3 + lxvw4x vs12, o0, T1 + lxvw4x vs13, o16, T1 + lxvw4x vs14, o32, T1 + lxvw4x vs15, o48, T1 - addi BO, BO, 32 + addi BO, BO, 128 xvmulsp vs32, vs0, vs8 @@ -2471,21 +1425,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 16 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs8, vs28, 0 - xxspltw vs9, vs28, 1 - xxspltw vs10, vs28, 2 - xxspltw vs11, vs28, 3 + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + lxvw4x vs10, o32, T1 + lxvw4x vs11, o48, T1 - lxvw4x vs29, o16, BO + addi T1, T1, 64 - xxspltw vs12, vs29, 0 - xxspltw vs13, vs29, 1 - xxspltw vs14, vs29, 2 - xxspltw vs15, vs29, 3 + lxvw4x vs12, o0, T1 + lxvw4x vs13, o16, T1 + lxvw4x vs14, o32, T1 + lxvw4x vs15, o48, T1 - addi BO, BO, 32 + addi BO, BO, 128 xvmaddasp vs32, vs0, vs8 @@ -2517,34 +1471,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif - - stxvw4x vs32, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - #ifdef TRMMKERNEL - lxvw4x vs0, o0, TBUFFER + xvmulsp vs0, vs32, alpha_vr #else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs0, vs0, vs28 + xvmaddasp vs0, vs32, alpha_vr #endif - - - stxvw4x vs0, o0, T1 add T1, T1, LDC @@ -2556,34 +1488,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif - - stxvw4x vs33, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - #ifdef TRMMKERNEL - lxvw4x vs0, o0, TBUFFER + xvmulsp vs0, vs33, alpha_vr #else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs0, vs0, vs28 + xvmaddasp vs0, vs33, alpha_vr #endif - - - stxvw4x vs0, o0, T1 add T1, T1, LDC @@ -2595,34 +1505,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif - - stxvw4x vs34, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - #ifdef TRMMKERNEL - lxvw4x vs0, o0, TBUFFER + xvmulsp vs0, vs34, alpha_vr #else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs0, vs0, vs28 + xvmaddasp vs0, vs34, alpha_vr #endif - - - stxvw4x vs0, o0, T1 add T1, T1, LDC @@ -2634,34 +1522,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif - - stxvw4x vs35, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - #ifdef TRMMKERNEL - lxvw4x vs0, o0, TBUFFER + xvmulsp vs0, vs35, alpha_vr #else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs0, vs0, vs28 + xvmaddasp vs0, vs35, alpha_vr #endif - - - stxvw4x vs0, o0, T1 add T1, T1, LDC @@ -2673,33 +1539,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif - - stxvw4x vs36, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - #ifdef TRMMKERNEL - lxvw4x vs0, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs0, vs0, vs28 -#endif - - - + xvmulsp vs0, vs36, alpha_vr +#else + xvmaddasp vs0, vs36, alpha_vr +#endif stxvw4x vs0, o0, T1 @@ -2712,34 +1556,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif - - stxvw4x vs37, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - #ifdef TRMMKERNEL - lxvw4x vs0, o0, TBUFFER + xvmulsp vs0, vs37, alpha_vr #else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs0, vs0, vs28 + xvmaddasp vs0, vs37, alpha_vr #endif - - - stxvw4x vs0, o0, T1 add T1, T1, LDC @@ -2751,34 +1573,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif - - stxvw4x vs38, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - #ifdef TRMMKERNEL - lxvw4x vs0, o0, TBUFFER + xvmulsp vs0, vs38, alpha_vr #else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs0, vs0, vs28 + xvmaddasp vs0, vs38, alpha_vr #endif - - - stxvw4x vs0, o0, T1 add T1, T1, LDC @@ -2790,34 +1590,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif - - stxvw4x vs39, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - #ifdef TRMMKERNEL - lxvw4x vs0, o0, TBUFFER + xvmulsp vs0, vs39, alpha_vr #else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs0, vs0, vs28 + xvmaddasp vs0, vs39, alpha_vr #endif - - - stxvw4x vs0, o0, T1 add T1, T1, LDC @@ -2841,18 +1619,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr T1, BO lxsspx vs8, o0, T1 - lxsspx vs9, o4, T1 - lxsspx vs10, o8, T1 - lxsspx vs11, o12, T1 + lxsspx vs9, o16, T1 + lxsspx vs10, o32, T1 + lxsspx vs11, o48, T1 - addi T1, T1, 16 + addi T1, T1, 64 lxsspx vs12, o0, T1 - lxsspx vs13, o4, T1 - lxsspx vs14, o8, T1 - lxsspx vs15, o12, T1 + lxsspx vs13, o16, T1 + lxsspx vs14, o32, T1 + lxsspx vs15, o48, T1 - addi BO, BO, 32 + + addi BO, BO, 128 .endm @@ -2867,43 +1646,44 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
mr T1, BO lxsspx vs16, o0, T1 - lxsspx vs17, o4, T1 - lxsspx vs18, o8, T1 - lxsspx vs19, o12, T1 + lxsspx vs17, o16, T1 + lxsspx vs18, o32, T1 + lxsspx vs19, o48, T1 - addi T1, T1, 16 + addi T1, T1, 64 lxsspx vs20, o0, T1 - lxsspx vs21, o4, T1 - lxsspx vs22, o8, T1 - lxsspx vs23, o12, T1 + lxsspx vs21, o16, T1 + lxsspx vs22, o32, T1 + lxsspx vs23, o48, T1 - addi BO, BO, 32 + + addi BO, BO, 128 - xsmulsp vs32, vs0, vs8 - xsmulsp vs33, vs1, vs8 + xsmuldp vs32, vs0, vs8 + xsmuldp vs33, vs1, vs8 - xsmulsp vs34, vs0, vs9 - xsmulsp vs35, vs1, vs9 + xsmuldp vs34, vs0, vs9 + xsmuldp vs35, vs1, vs9 - xsmulsp vs36, vs0, vs10 - xsmulsp vs37, vs1, vs10 + xsmuldp vs36, vs0, vs10 + xsmuldp vs37, vs1, vs10 - xsmulsp vs38, vs0, vs11 - xsmulsp vs39, vs1, vs11 + xsmuldp vs38, vs0, vs11 + xsmuldp vs39, vs1, vs11 - xsmulsp vs40, vs0, vs12 - xsmulsp vs41, vs1, vs12 + xsmuldp vs40, vs0, vs12 + xsmuldp vs41, vs1, vs12 - xsmulsp vs42, vs0, vs13 - xsmulsp vs43, vs1, vs13 + xsmuldp vs42, vs0, vs13 + xsmuldp vs43, vs1, vs13 - xsmulsp vs44, vs0, vs14 - xsmulsp vs45, vs1, vs14 + xsmuldp vs44, vs0, vs14 + xsmuldp vs45, vs1, vs14 - xsmulsp vs46, vs0, vs15 - xsmulsp vs47, vs1, vs15 + xsmuldp vs46, vs0, vs15 + xsmuldp vs47, vs1, vs15 .endm @@ -2919,43 +1699,44 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
mr T1, BO lxsspx vs16, o0, T1 - lxsspx vs17, o4, T1 - lxsspx vs18, o8, T1 - lxsspx vs19, o12, T1 + lxsspx vs17, o16, T1 + lxsspx vs18, o32, T1 + lxsspx vs19, o48, T1 - addi T1, T1, 16 + addi T1, T1, 64 lxsspx vs20, o0, T1 - lxsspx vs21, o4, T1 - lxsspx vs22, o8, T1 - lxsspx vs23, o12, T1 + lxsspx vs21, o16, T1 + lxsspx vs22, o32, T1 + lxsspx vs23, o48, T1 - addi BO, BO, 32 + + addi BO, BO, 128 - xsmaddasp vs32, vs0, vs8 - xsmaddasp vs33, vs1, vs8 + xsmaddadp vs32, vs0, vs8 + xsmaddadp vs33, vs1, vs8 - xsmaddasp vs34, vs0, vs9 - xsmaddasp vs35, vs1, vs9 + xsmaddadp vs34, vs0, vs9 + xsmaddadp vs35, vs1, vs9 - xsmaddasp vs36, vs0, vs10 - xsmaddasp vs37, vs1, vs10 + xsmaddadp vs36, vs0, vs10 + xsmaddadp vs37, vs1, vs10 - xsmaddasp vs38, vs0, vs11 - xsmaddasp vs39, vs1, vs11 + xsmaddadp vs38, vs0, vs11 + xsmaddadp vs39, vs1, vs11 - xsmaddasp vs40, vs0, vs12 - xsmaddasp vs41, vs1, vs12 + xsmaddadp vs40, vs0, vs12 + xsmaddadp vs41, vs1, vs12 - xsmaddasp vs42, vs0, vs13 - xsmaddasp vs43, vs1, vs13 + xsmaddadp vs42, vs0, vs13 + xsmaddadp vs43, vs1, vs13 - xsmaddasp vs44, vs0, vs14 - xsmaddasp vs45, vs1, vs14 + xsmaddadp vs44, vs0, vs14 + xsmaddadp vs45, vs1, vs14 - xsmaddasp vs46, vs0, vs15 - xsmaddasp vs47, vs1, vs15 + xsmaddadp vs46, vs0, vs15 + xsmaddadp vs47, vs1, vs15 .endm @@ -2971,43 +1752,44 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
mr T1, BO lxsspx vs8, o0, T1 - lxsspx vs9, o4, T1 - lxsspx vs10, o8, T1 - lxsspx vs11, o12, T1 + lxsspx vs9, o16, T1 + lxsspx vs10, o32, T1 + lxsspx vs11, o48, T1 - addi T1, T1, 16 + addi T1, T1, 64 lxsspx vs12, o0, T1 - lxsspx vs13, o4, T1 - lxsspx vs14, o8, T1 - lxsspx vs15, o12, T1 + lxsspx vs13, o16, T1 + lxsspx vs14, o32, T1 + lxsspx vs15, o48, T1 - addi BO, BO, 32 + + addi BO, BO, 128 - xsmaddasp vs32, vs4, vs16 - xsmaddasp vs33, vs5, vs16 + xsmaddadp vs32, vs4, vs16 + xsmaddadp vs33, vs5, vs16 - xsmaddasp vs34, vs4, vs17 - xsmaddasp vs35, vs5, vs17 + xsmaddadp vs34, vs4, vs17 + xsmaddadp vs35, vs5, vs17 - xsmaddasp vs36, vs4, vs18 - xsmaddasp vs37, vs5, vs18 + xsmaddadp vs36, vs4, vs18 + xsmaddadp vs37, vs5, vs18 - xsmaddasp vs38, vs4, vs19 - xsmaddasp vs39, vs5, vs19 + xsmaddadp vs38, vs4, vs19 + xsmaddadp vs39, vs5, vs19 - xsmaddasp vs40, vs4, vs20 - xsmaddasp vs41, vs5, vs20 + xsmaddadp vs40, vs4, vs20 + xsmaddadp vs41, vs5, vs20 - xsmaddasp vs42, vs4, vs21 - xsmaddasp vs43, vs5, vs21 + xsmaddadp vs42, vs4, vs21 + xsmaddadp vs43, vs5, vs21 - xsmaddasp vs44, vs4, vs22 - xsmaddasp vs45, vs5, vs22 + xsmaddadp vs44, vs4, vs22 + xsmaddadp vs45, vs5, vs22 - xsmaddasp vs46, vs4, vs23 - xsmaddasp vs47, vs5, vs23 + xsmaddadp vs46, vs4, vs23 + xsmaddadp vs47, vs5, vs23 .endm @@ -3015,29 +1797,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro KERNEL8x2_E2 - xsmaddasp vs32, vs4, vs16 - xsmaddasp vs33, vs5, vs16 + xsmaddadp vs32, vs4, vs16 + xsmaddadp vs33, vs5, vs16 - xsmaddasp vs34, vs4, vs17 - xsmaddasp vs35, vs5, vs17 + xsmaddadp vs34, vs4, vs17 + xsmaddadp vs35, vs5, vs17 - xsmaddasp vs36, vs4, vs18 - xsmaddasp vs37, vs5, vs18 + xsmaddadp vs36, vs4, vs18 + xsmaddadp vs37, vs5, vs18 - xsmaddasp vs38, vs4, vs19 - xsmaddasp vs39, vs5, vs19 + xsmaddadp vs38, vs4, vs19 + xsmaddadp vs39, vs5, vs19 - xsmaddasp vs40, vs4, vs20 - xsmaddasp vs41, vs5, vs20 + xsmaddadp vs40, vs4, vs20 + xsmaddadp vs41, vs5, vs20 - xsmaddasp vs42, vs4, vs21 - xsmaddasp vs43, vs5, vs21 + xsmaddadp vs42, vs4, vs21 + xsmaddadp vs43, vs5, vs21 - xsmaddasp vs44, vs4, vs22 - xsmaddasp vs45, vs5, vs22 + xsmaddadp vs44, vs4, vs22 + xsmaddadp vs45, vs5, vs22 - xsmaddasp vs46, vs4, vs23 - xsmaddasp vs47, vs5, vs23 + xsmaddadp vs46, vs4, vs23 + xsmaddadp vs47, vs5, vs23 .endm @@ -3053,43 +1835,44 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
mr T1, BO lxsspx vs8, o0, T1 - lxsspx vs9, o4, T1 - lxsspx vs10, o8, T1 - lxsspx vs11, o12, T1 + lxsspx vs9, o16, T1 + lxsspx vs10, o32, T1 + lxsspx vs11, o48, T1 - addi T1, T1, 16 + addi T1, T1, 64 lxsspx vs12, o0, T1 - lxsspx vs13, o4, T1 - lxsspx vs14, o8, T1 - lxsspx vs15, o12, T1 + lxsspx vs13, o16, T1 + lxsspx vs14, o32, T1 + lxsspx vs15, o48, T1 - addi BO, BO, 32 + + addi BO, BO, 128 - xsmulsp vs32, vs0, vs8 - xsmulsp vs33, vs1, vs8 + xsmuldp vs32, vs0, vs8 + xsmuldp vs33, vs1, vs8 - xsmulsp vs34, vs0, vs9 - xsmulsp vs35, vs1, vs9 + xsmuldp vs34, vs0, vs9 + xsmuldp vs35, vs1, vs9 - xsmulsp vs36, vs0, vs10 - xsmulsp vs37, vs1, vs10 + xsmuldp vs36, vs0, vs10 + xsmuldp vs37, vs1, vs10 - xsmulsp vs38, vs0, vs11 - xsmulsp vs39, vs1, vs11 + xsmuldp vs38, vs0, vs11 + xsmuldp vs39, vs1, vs11 - xsmulsp vs40, vs0, vs12 - xsmulsp vs41, vs1, vs12 + xsmuldp vs40, vs0, vs12 + xsmuldp vs41, vs1, vs12 - xsmulsp vs42, vs0, vs13 - xsmulsp vs43, vs1, vs13 + xsmuldp vs42, vs0, vs13 + xsmuldp vs43, vs1, vs13 - xsmulsp vs44, vs0, vs14 - xsmulsp vs45, vs1, vs14 + xsmuldp vs44, vs0, vs14 + xsmuldp vs45, vs1, vs14 - xsmulsp vs46, vs0, vs15 - xsmulsp vs47, vs1, vs15 + xsmuldp vs46, vs0, vs15 + xsmuldp vs47, vs1, vs15 .endm @@ -3105,43 +1888,44 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
mr T1, BO lxsspx vs8, o0, T1 - lxsspx vs9, o4, T1 - lxsspx vs10, o8, T1 - lxsspx vs11, o12, T1 + lxsspx vs9, o16, T1 + lxsspx vs10, o32, T1 + lxsspx vs11, o48, T1 - addi T1, T1, 16 + addi T1, T1, 64 lxsspx vs12, o0, T1 - lxsspx vs13, o4, T1 - lxsspx vs14, o8, T1 - lxsspx vs15, o12, T1 + lxsspx vs13, o16, T1 + lxsspx vs14, o32, T1 + lxsspx vs15, o48, T1 - addi BO, BO, 32 + + addi BO, BO, 128 - xsmaddasp vs32, vs0, vs8 - xsmaddasp vs33, vs1, vs8 + xsmaddadp vs32, vs0, vs8 + xsmaddadp vs33, vs1, vs8 - xsmaddasp vs34, vs0, vs9 - xsmaddasp vs35, vs1, vs9 + xsmaddadp vs34, vs0, vs9 + xsmaddadp vs35, vs1, vs9 - xsmaddasp vs36, vs0, vs10 - xsmaddasp vs37, vs1, vs10 + xsmaddadp vs36, vs0, vs10 + xsmaddadp vs37, vs1, vs10 - xsmaddasp vs38, vs0, vs11 - xsmaddasp vs39, vs1, vs11 + xsmaddadp vs38, vs0, vs11 + xsmaddadp vs39, vs1, vs11 - xsmaddasp vs40, vs0, vs12 - xsmaddasp vs41, vs1, vs12 + xsmaddadp vs40, vs0, vs12 + xsmaddadp vs41, vs1, vs12 - xsmaddasp vs42, vs0, vs13 - xsmaddasp vs43, vs1, vs13 + xsmaddadp vs42, vs0, vs13 + xsmaddadp vs43, vs1, vs13 - xsmaddasp vs44, vs0, vs14 - xsmaddasp vs45, vs1, vs14 + xsmaddadp vs44, vs0, vs14 + xsmaddadp vs45, vs1, vs14 - xsmaddasp vs46, vs0, vs15 - xsmaddasp vs47, vs1, vs15 + xsmaddadp vs46, vs0, vs15 + xsmaddadp vs47, vs1, vs15 .endm @@ -3158,17 +1942,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifdef TRMMKERNEL - - xsmulsp vs0, vs32, alpha_r - xsmulsp vs1, vs33, alpha_r - + xsmuldp vs0, vs32, alpha_r + xsmuldp vs1, vs33, alpha_r #else - - xsmulsp vs28, vs32, alpha_r - xsaddsp vs0, vs0, vs28 - xsmulsp vs28, vs33, alpha_r - xsaddsp vs1, vs1, vs28 - + xsmaddadp vs0, vs32, alpha_r + xsmaddadp vs1, vs33, alpha_r #endif stxsspx vs0, o0, T1 @@ -3185,17 +1963,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif #ifdef TRMMKERNEL - - xsmulsp vs0, vs34, alpha_r - xsmulsp vs1, vs35, alpha_r - + xsmuldp vs0, vs34, alpha_r + xsmuldp vs1, vs35, alpha_r #else - - xsmulsp vs28, vs34, alpha_r - xsaddsp vs0, vs0, vs28 - xsmulsp vs28, vs35, alpha_r - xsaddsp vs1, vs1, vs28 - + xsmaddadp vs0, vs34, alpha_r + xsmaddadp vs1, vs35, alpha_r #endif stxsspx vs0, o0, T1 @@ -3212,17 +1984,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifdef TRMMKERNEL - - xsmulsp vs0, vs36, alpha_r - xsmulsp vs1, vs37, alpha_r - + xsmuldp vs0, vs36, alpha_r + xsmuldp vs1, vs37, alpha_r #else - - xsmulsp vs28, vs36, alpha_r - xsaddsp vs0, vs0, vs28 - xsmulsp vs28, vs37, alpha_r - xsaddsp vs1, vs1, vs28 - + xsmaddadp vs0, vs36, alpha_r + xsmaddadp vs1, vs37, alpha_r #endif stxsspx vs0, o0, T1 @@ -3239,17 +2005,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifdef TRMMKERNEL - - xsmulsp vs0, vs38, alpha_r - xsmulsp vs1, vs39, alpha_r - + xsmuldp vs0, vs38, alpha_r + xsmuldp vs1, vs39, alpha_r #else - - xsmulsp vs28, vs38, alpha_r - xsaddsp vs0, vs0, vs28 - xsmulsp vs28, vs39, alpha_r - xsaddsp vs1, vs1, vs28 - + xsmaddadp vs0, vs38, alpha_r + xsmaddadp vs1, vs39, alpha_r #endif stxsspx vs0, o0, T1 @@ -3266,17 +2026,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifdef TRMMKERNEL - - xsmulsp vs0, vs40, alpha_r - xsmulsp vs1, vs41, alpha_r - + xsmuldp vs0, vs40, alpha_r + xsmuldp vs1, vs41, alpha_r #else - - xsmulsp vs28, vs40, alpha_r - xsaddsp vs0, vs0, vs28 - xsmulsp vs28, vs41, alpha_r - xsaddsp vs1, vs1, vs28 - + xsmaddadp vs0, vs40, alpha_r + xsmaddadp vs1, vs41, alpha_r #endif stxsspx vs0, o0, T1 @@ -3293,17 +2047,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif #ifdef TRMMKERNEL - - xsmulsp vs0, vs42, alpha_r - xsmulsp vs1, vs43, alpha_r - + xsmuldp vs0, vs42, alpha_r + xsmuldp vs1, vs43, alpha_r #else - - xsmulsp vs28, vs42, alpha_r - xsaddsp vs0, vs0, vs28 - xsmulsp vs28, vs43, alpha_r - xsaddsp vs1, vs1, vs28 - + xsmaddadp vs0, vs42, alpha_r + xsmaddadp vs1, vs43, alpha_r #endif stxsspx vs0, o0, T1 @@ -3320,17 +2068,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifdef TRMMKERNEL - - xsmulsp vs0, vs44, alpha_r - xsmulsp vs1, vs45, alpha_r - + xsmuldp vs0, vs44, alpha_r + xsmuldp vs1, vs45, alpha_r #else - - xsmulsp vs28, vs44, alpha_r - xsaddsp vs0, vs0, vs28 - xsmulsp vs28, vs45, alpha_r - xsaddsp vs1, vs1, vs28 - + xsmaddadp vs0, vs44, alpha_r + xsmaddadp vs1, vs45, alpha_r #endif stxsspx vs0, o0, T1 @@ -3347,17 +2089,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifdef TRMMKERNEL - - xsmulsp vs0, vs46, alpha_r - xsmulsp vs1, vs47, alpha_r - + xsmuldp vs0, vs46, alpha_r + xsmuldp vs1, vs47, alpha_r #else - - xsmulsp vs28, vs46, alpha_r - xsaddsp vs0, vs0, vs28 - xsmulsp vs28, vs47, alpha_r - xsaddsp vs1, vs1, vs28 - + xsmaddadp vs0, vs46, alpha_r + xsmaddadp vs1, vs47, alpha_r #endif stxsspx vs0, o0, T1 @@ -3383,18 +2119,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr T1, BO lxsspx vs8, o0, T1 - lxsspx vs9, o4, T1 - lxsspx vs10, o8, T1 - lxsspx vs11, o12, T1 + lxsspx vs9, o16, T1 + lxsspx vs10, o32, T1 + lxsspx vs11, o48, T1 - addi T1, T1, 16 + addi T1, T1, 64 lxsspx vs12, o0, T1 - lxsspx vs13, o4, T1 - lxsspx vs14, o8, T1 - lxsspx vs15, o12, T1 + lxsspx vs13, o16, T1 + lxsspx vs14, o32, T1 + lxsspx vs15, o48, T1 - addi BO, BO, 32 + + addi BO, BO, 128 .endm @@ -3408,35 +2145,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
mr T1, BO lxsspx vs16, o0, T1 - lxsspx vs17, o4, T1 - lxsspx vs18, o8, T1 - lxsspx vs19, o12, T1 + lxsspx vs17, o16, T1 + lxsspx vs18, o32, T1 + lxsspx vs19, o48, T1 - addi T1, T1, 16 + addi T1, T1, 64 lxsspx vs20, o0, T1 - lxsspx vs21, o4, T1 - lxsspx vs22, o8, T1 - lxsspx vs23, o12, T1 + lxsspx vs21, o16, T1 + lxsspx vs22, o32, T1 + lxsspx vs23, o48, T1 - addi BO, BO, 32 + + addi BO, BO, 128 - xsmulsp vs32, vs0, vs8 + xsmuldp vs32, vs0, vs8 - xsmulsp vs33, vs0, vs9 + xsmuldp vs33, vs0, vs9 - xsmulsp vs34, vs0, vs10 + xsmuldp vs34, vs0, vs10 - xsmulsp vs35, vs0, vs11 + xsmuldp vs35, vs0, vs11 - xsmulsp vs36, vs0, vs12 + xsmuldp vs36, vs0, vs12 - xsmulsp vs37, vs0, vs13 + xsmuldp vs37, vs0, vs13 - xsmulsp vs38, vs0, vs14 + xsmuldp vs38, vs0, vs14 - xsmulsp vs39, vs0, vs15 + xsmuldp vs39, vs0, vs15 .endm @@ -3451,35 +2189,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr T1, BO lxsspx vs16, o0, T1 - lxsspx vs17, o4, T1 - lxsspx vs18, o8, T1 - lxsspx vs19, o12, T1 + lxsspx vs17, o16, T1 + lxsspx vs18, o32, T1 + lxsspx vs19, o48, T1 - addi T1, T1, 16 + addi T1, T1, 64 lxsspx vs20, o0, T1 - lxsspx vs21, o4, T1 - lxsspx vs22, o8, T1 - lxsspx vs23, o12, T1 + lxsspx vs21, o16, T1 + lxsspx vs22, o32, T1 + lxsspx vs23, o48, T1 - addi BO, BO, 32 + + addi BO, BO, 128 - xsmaddasp vs32, vs0, vs8 + xsmaddadp vs32, vs0, vs8 - xsmaddasp vs33, vs0, vs9 + xsmaddadp vs33, vs0, vs9 - xsmaddasp vs34, vs0, vs10 + xsmaddadp vs34, vs0, vs10 - xsmaddasp vs35, vs0, vs11 + xsmaddadp vs35, vs0, vs11 - xsmaddasp vs36, vs0, vs12 + xsmaddadp vs36, vs0, vs12 - xsmaddasp vs37, vs0, vs13 + xsmaddadp vs37, vs0, vs13 - xsmaddasp vs38, vs0, vs14 + xsmaddadp vs38, vs0, vs14 - xsmaddasp vs39, vs0, vs15 + xsmaddadp vs39, vs0, vs15 .endm @@ -3494,35 +2233,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
mr T1, BO lxsspx vs8, o0, T1 - lxsspx vs9, o4, T1 - lxsspx vs10, o8, T1 - lxsspx vs11, o12, T1 + lxsspx vs9, o16, T1 + lxsspx vs10, o32, T1 + lxsspx vs11, o48, T1 - addi T1, T1, 16 + addi T1, T1, 64 lxsspx vs12, o0, T1 - lxsspx vs13, o4, T1 - lxsspx vs14, o8, T1 - lxsspx vs15, o12, T1 + lxsspx vs13, o16, T1 + lxsspx vs14, o32, T1 + lxsspx vs15, o48, T1 - addi BO, BO, 32 + + addi BO, BO, 128 - xsmaddasp vs32, vs4, vs16 + xsmaddadp vs32, vs4, vs16 - xsmaddasp vs33, vs4, vs17 + xsmaddadp vs33, vs4, vs17 - xsmaddasp vs34, vs4, vs18 + xsmaddadp vs34, vs4, vs18 - xsmaddasp vs35, vs4, vs19 + xsmaddadp vs35, vs4, vs19 - xsmaddasp vs36, vs4, vs20 + xsmaddadp vs36, vs4, vs20 - xsmaddasp vs37, vs4, vs21 + xsmaddadp vs37, vs4, vs21 - xsmaddasp vs38, vs4, vs22 + xsmaddadp vs38, vs4, vs22 - xsmaddasp vs39, vs4, vs23 + xsmaddadp vs39, vs4, vs23 .endm @@ -3530,21 +2270,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL8x1_E2 - xsmaddasp vs32, vs4, vs16 + xsmaddadp vs32, vs4, vs16 - xsmaddasp vs33, vs4, vs17 + xsmaddadp vs33, vs4, vs17 - xsmaddasp vs34, vs4, vs18 + xsmaddadp vs34, vs4, vs18 - xsmaddasp vs35, vs4, vs19 + xsmaddadp vs35, vs4, vs19 - xsmaddasp vs36, vs4, vs20 + xsmaddadp vs36, vs4, vs20 - xsmaddasp vs37, vs4, vs21 + xsmaddadp vs37, vs4, vs21 - xsmaddasp vs38, vs4, vs22 + xsmaddadp vs38, vs4, vs22 - xsmaddasp vs39, vs4, vs23 + xsmaddadp vs39, vs4, vs23 .endm @@ -3559,35 +2299,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
mr T1, BO lxsspx vs8, o0, T1 - lxsspx vs9, o4, T1 - lxsspx vs10, o8, T1 - lxsspx vs11, o12, T1 + lxsspx vs9, o16, T1 + lxsspx vs10, o32, T1 + lxsspx vs11, o48, T1 - addi T1, T1, 16 + addi T1, T1, 64 lxsspx vs12, o0, T1 - lxsspx vs13, o4, T1 - lxsspx vs14, o8, T1 - lxsspx vs15, o12, T1 + lxsspx vs13, o16, T1 + lxsspx vs14, o32, T1 + lxsspx vs15, o48, T1 - addi BO, BO, 32 + + addi BO, BO, 128 - xsmulsp vs32, vs0, vs8 + xsmuldp vs32, vs0, vs8 - xsmulsp vs33, vs0, vs9 + xsmuldp vs33, vs0, vs9 - xsmulsp vs34, vs0, vs10 + xsmuldp vs34, vs0, vs10 - xsmulsp vs35, vs0, vs11 + xsmuldp vs35, vs0, vs11 - xsmulsp vs36, vs0, vs12 + xsmuldp vs36, vs0, vs12 - xsmulsp vs37, vs0, vs13 + xsmuldp vs37, vs0, vs13 - xsmulsp vs38, vs0, vs14 + xsmuldp vs38, vs0, vs14 - xsmulsp vs39, vs0, vs15 + xsmuldp vs39, vs0, vs15 .endm @@ -3602,35 +2343,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr T1, BO lxsspx vs8, o0, T1 - lxsspx vs9, o4, T1 - lxsspx vs10, o8, T1 - lxsspx vs11, o12, T1 + lxsspx vs9, o16, T1 + lxsspx vs10, o32, T1 + lxsspx vs11, o48, T1 - addi T1, T1, 16 + addi T1, T1, 64 lxsspx vs12, o0, T1 - lxsspx vs13, o4, T1 - lxsspx vs14, o8, T1 - lxsspx vs15, o12, T1 + lxsspx vs13, o16, T1 + lxsspx vs14, o32, T1 + lxsspx vs15, o48, T1 - addi BO, BO, 32 + + addi BO, BO, 128 - xsmaddasp vs32, vs0, vs8 + xsmaddadp vs32, vs0, vs8 - xsmaddasp vs33, vs0, vs9 + xsmaddadp vs33, vs0, vs9 - xsmaddasp vs34, vs0, vs10 + xsmaddadp vs34, vs0, vs10 - xsmaddasp vs35, vs0, vs11 + xsmaddadp vs35, vs0, vs11 - xsmaddasp vs36, vs0, vs12 + xsmaddadp vs36, vs0, vs12 - xsmaddasp vs37, vs0, vs13 + xsmaddadp vs37, vs0, vs13 - xsmaddasp vs38, vs0, vs14 + xsmaddadp vs38, vs0, vs14 - xsmaddasp vs39, vs0, vs15 + xsmaddadp vs39, vs0, vs15 .endm @@ -3646,14 +2388,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif #ifdef TRMMKERNEL - - xsmulsp vs0, vs32, alpha_r - + xsmuldp vs0, vs32, alpha_r #else - - xsmulsp vs28, vs32, alpha_r - xsaddsp vs0, vs0, vs28 - + xsmaddadp vs0, vs32, alpha_r #endif stxsspx vs0, o0, T1 @@ -3668,14 +2405,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifdef TRMMKERNEL - - xsmulsp vs0, vs33, alpha_r - + xsmuldp vs0, vs33, alpha_r #else - - xsmulsp vs28, vs33, alpha_r - xsaddsp vs0, vs0, vs28 - + xsmaddadp vs0, vs33, alpha_r #endif stxsspx vs0, o0, T1 @@ -3690,14 +2422,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifdef TRMMKERNEL - - xsmulsp vs0, vs34, alpha_r - + xsmuldp vs0, vs34, alpha_r #else - - xsmulsp vs28, vs34, alpha_r - xsaddsp vs0, vs0, vs28 - + xsmaddadp vs0, vs34, alpha_r #endif stxsspx vs0, o0, T1 @@ -3712,14 +2439,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifdef TRMMKERNEL - - xsmulsp vs0, vs35, alpha_r - + xsmuldp vs0, vs35, alpha_r #else - - xsmulsp vs28, vs35, alpha_r - xsaddsp vs0, vs0, vs28 - + xsmaddadp vs0, vs35, alpha_r #endif stxsspx vs0, o0, T1 @@ -3734,14 +2456,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifdef TRMMKERNEL - - xsmulsp vs0, vs36, alpha_r - + xsmuldp vs0, vs36, alpha_r #else - - xsmulsp vs28, vs36, alpha_r - xsaddsp vs0, vs0, vs28 - + xsmaddadp vs0, vs36, alpha_r #endif stxsspx vs0, o0, T1 @@ -3756,14 +2473,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifdef TRMMKERNEL - - xsmulsp vs0, vs37, alpha_r - + xsmuldp vs0, vs37, alpha_r #else - - xsmulsp vs28, vs37, alpha_r - xsaddsp vs0, vs0, vs28 - + xsmaddadp vs0, vs37, alpha_r #endif stxsspx vs0, o0, T1 @@ -3778,14 +2490,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif #ifdef TRMMKERNEL - - xsmulsp vs0, vs38, alpha_r - + xsmuldp vs0, vs38, alpha_r #else - - xsmulsp vs28, vs38, alpha_r - xsaddsp vs0, vs0, vs28 - + xsmaddadp vs0, vs38, alpha_r #endif stxsspx vs0, o0, T1 @@ -3800,14 +2507,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifdef TRMMKERNEL - - xsmulsp vs0, vs39, alpha_r - + xsmuldp vs0, vs39, alpha_r #else - - xsmulsp vs28, vs39, alpha_r - xsaddsp vs0, vs0, vs28 - + xsmaddadp vs0, vs39, alpha_r #endif stxsspx vs0, o0, T1 @@ -3832,14 +2534,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs8, vs28, 0 - xxspltw vs9, vs28, 1 - xxspltw vs10, vs28, 2 - xxspltw vs11, vs28, 3 + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + lxvw4x vs10, o32, T1 + lxvw4x vs11, o48, T1 - addi BO, BO, 16 + addi BO, BO, 64 .endm @@ -3853,14 +2555,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs16, vs28, 0 - xxspltw vs17, vs28, 1 - xxspltw vs18, vs28, 2 - xxspltw vs19, vs28, 3 + lxvw4x vs16, o0, T1 + lxvw4x vs17, o16, T1 + lxvw4x vs18, o32, T1 + lxvw4x vs19, o48, T1 - addi BO, BO, 16 + addi BO, BO, 64 xvmulsp vs32, vs0, vs8 @@ -3896,14 +2598,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs16, vs28, 0 - xxspltw vs17, vs28, 1 - xxspltw vs18, vs28, 2 - xxspltw vs19, vs28, 3 + lxvw4x vs16, o0, T1 + lxvw4x vs17, o16, T1 + lxvw4x vs18, o32, T1 + lxvw4x vs19, o48, T1 - addi BO, BO, 16 + addi BO, BO, 64 xvmaddasp vs32, vs0, vs8 @@ -3939,14 +2641,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi AO, AO, 64 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs8, vs28, 0 - xxspltw vs9, vs28, 1 - xxspltw vs10, vs28, 2 - xxspltw vs11, vs28, 3 + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + lxvw4x vs10, o32, T1 + lxvw4x vs11, o48, T1 - addi BO, BO, 16 + addi BO, BO, 64 xvmaddasp vs32, vs4, vs16 @@ -4008,14 +2710,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs8, vs28, 0 - xxspltw vs9, vs28, 1 - xxspltw vs10, vs28, 2 - xxspltw vs11, vs28, 3 + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + lxvw4x vs10, o32, T1 + lxvw4x vs11, o48, T1 - addi BO, BO, 16 + addi BO, BO, 64 xvmulsp vs32, vs0, vs8 @@ -4051,14 +2753,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs8, vs28, 0 - xxspltw vs9, vs28, 1 - xxspltw vs10, vs28, 2 - xxspltw vs11, vs28, 3 + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + lxvw4x vs10, o32, T1 + lxvw4x vs11, o48, T1 - addi BO, BO, 16 + addi BO, BO, 64 xvmaddasp vs32, vs0, vs8 @@ -4097,106 +2799,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif - - stxvw4x vs32, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs0, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs0, vs0, vs28 -#endif - - stxvw4x vs33, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs1, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs1, vs1, vs28 -#endif - - stxvw4x vs34, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs2, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs2, vs2, vs28 -#endif - - stxvw4x vs35, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - #ifdef TRMMKERNEL - lxvw4x vs3, o0, TBUFFER + xvmulsp vs0, vs32, alpha_vr + xvmulsp vs1, vs33, alpha_vr + xvmulsp vs2, vs34, alpha_vr + xvmulsp vs3, vs35, alpha_vr #else - lxvw4x vs28, o0, TBUFFER - 
xvaddsp vs3, vs3, vs28 + xvmaddasp vs0, vs32, alpha_vr + xvmaddasp vs1, vs33, alpha_vr + xvmaddasp vs2, vs34, alpha_vr + xvmaddasp vs3, vs35, alpha_vr #endif - - - stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 stxvw4x vs2, o32, T1 @@ -4214,106 +2828,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif - - stxvw4x vs36, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs0, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs0, vs0, vs28 -#endif - - stxvw4x vs37, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs1, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs1, vs1, vs28 -#endif - - stxvw4x vs38, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs2, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs2, vs2, vs28 -#endif - - stxvw4x vs39, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, 
vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - #ifdef TRMMKERNEL - lxvw4x vs3, o0, TBUFFER + xvmulsp vs0, vs36, alpha_vr + xvmulsp vs1, vs37, alpha_vr + xvmulsp vs2, vs38, alpha_vr + xvmulsp vs3, vs39, alpha_vr #else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs3, vs3, vs28 + xvmaddasp vs0, vs36, alpha_vr + xvmaddasp vs1, vs37, alpha_vr + xvmaddasp vs2, vs38, alpha_vr + xvmaddasp vs3, vs39, alpha_vr #endif - - - stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 stxvw4x vs2, o32, T1 @@ -4331,106 +2857,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif - - stxvw4x vs40, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs0, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs0, vs0, vs28 -#endif - - stxvw4x vs41, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs1, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs1, vs1, vs28 -#endif - - stxvw4x vs42, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x 
vs2, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs2, vs2, vs28 -#endif - - stxvw4x vs43, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - #ifdef TRMMKERNEL - lxvw4x vs3, o0, TBUFFER + xvmulsp vs0, vs40, alpha_vr + xvmulsp vs1, vs41, alpha_vr + xvmulsp vs2, vs42, alpha_vr + xvmulsp vs3, vs43, alpha_vr #else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs3, vs3, vs28 + xvmaddasp vs0, vs40, alpha_vr + xvmaddasp vs1, vs41, alpha_vr + xvmaddasp vs2, vs42, alpha_vr + xvmaddasp vs3, vs43, alpha_vr #endif - - - stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 stxvw4x vs2, o32, T1 @@ -4448,105 +2886,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif - - stxvw4x vs44, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs0, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs0, vs0, vs28 -#endif - - stxvw4x vs45, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs1, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs1, vs1, vs28 -#endif - - stxvw4x vs46, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx 
vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs2, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs2, vs2, vs28 -#endif - - stxvw4x vs47, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - #ifdef TRMMKERNEL - lxvw4x vs3, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs3, vs3, vs28 -#endif - - - + xvmulsp vs0, vs44, alpha_vr + xvmulsp vs1, vs45, alpha_vr + xvmulsp vs2, vs46, alpha_vr + xvmulsp vs3, vs47, alpha_vr +#else + xvmaddasp vs0, vs44, alpha_vr + xvmaddasp vs1, vs45, alpha_vr + xvmaddasp vs2, vs46, alpha_vr + xvmaddasp vs3, vs47, alpha_vr +#endif stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 @@ -4571,14 +2921,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 32 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs8, vs28, 0 - xxspltw vs9, vs28, 1 - xxspltw vs10, vs28, 2 - xxspltw vs11, vs28, 3 + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + lxvw4x vs10, o32, T1 + lxvw4x vs11, o48, T1 - addi BO, BO, 16 + addi BO, BO, 64 .endm @@ -4590,14 +2940,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi AO, AO, 32 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs16, vs28, 0 - xxspltw vs17, vs28, 1 - xxspltw vs18, vs28, 2 - xxspltw vs19, vs28, 3 + lxvw4x vs16, o0, T1 + lxvw4x vs17, o16, T1 + lxvw4x vs18, o32, T1 + lxvw4x vs19, o48, T1 - addi BO, BO, 16 + addi BO, BO, 64 xvmulsp vs32, vs0, vs8 @@ -4623,14 +2973,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 32 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs16, vs28, 0 - xxspltw vs17, vs28, 1 - xxspltw vs18, vs28, 2 - xxspltw vs19, vs28, 3 + lxvw4x vs16, o0, T1 + lxvw4x vs17, o16, T1 + lxvw4x vs18, o32, T1 + lxvw4x vs19, o48, T1 - addi BO, BO, 16 + addi BO, BO, 64 xvmaddasp vs32, vs0, vs8 @@ -4656,14 +3006,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 32 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs8, vs28, 0 - xxspltw vs9, vs28, 1 - xxspltw vs10, vs28, 2 - xxspltw vs11, vs28, 3 + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + lxvw4x vs10, o32, T1 + lxvw4x vs11, o48, T1 - addi BO, BO, 16 + addi BO, BO, 64 xvmaddasp vs32, vs4, vs16 @@ -4707,14 +3057,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 32 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs8, vs28, 0 - xxspltw vs9, vs28, 1 - xxspltw vs10, vs28, 2 - xxspltw vs11, vs28, 3 + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + lxvw4x vs10, o32, T1 + lxvw4x vs11, o48, T1 - addi BO, BO, 16 + addi BO, BO, 64 xvmulsp vs32, vs0, vs8 @@ -4740,14 +3090,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 32 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs8, vs28, 0 - xxspltw vs9, vs28, 1 - xxspltw vs10, vs28, 2 - xxspltw vs11, vs28, 3 + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + lxvw4x vs10, o32, T1 + lxvw4x vs11, o48, T1 - addi BO, BO, 16 + addi BO, BO, 64 xvmaddasp vs32, vs0, vs8 @@ -4776,58 +3126,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif - - stxvw4x vs32, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs0, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs0, vs0, vs28 -#endif - - stxvw4x vs33, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - #ifdef TRMMKERNEL - lxvw4x vs1, o0, TBUFFER + xvmulsp vs0, vs32, alpha_vr + xvmulsp vs1, vs33, alpha_vr #else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs1, vs1, vs28 + xvmaddasp vs0, vs32, alpha_vr + xvmaddasp vs1, vs33, alpha_vr #endif - - - stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 @@ -4841,58 +3147,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif - - stxvw4x vs34, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs0, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs0, vs0, vs28 -#endif - - stxvw4x vs35, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - #ifdef TRMMKERNEL - lxvw4x vs1, o0, TBUFFER + xvmulsp vs0, vs34, alpha_vr + xvmulsp vs1, vs35, alpha_vr #else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs1, vs1, vs28 + xvmaddasp vs0, vs34, alpha_vr + xvmaddasp vs1, vs35, alpha_vr #endif - - - stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 @@ -4906,58 +3168,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif - - stxvw4x vs36, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs0, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs0, vs0, vs28 -#endif - - stxvw4x vs37, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - #ifdef TRMMKERNEL - lxvw4x vs1, o0, TBUFFER + xvmulsp vs0, vs36, alpha_vr + xvmulsp vs1, vs37, alpha_vr #else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs1, vs1, vs28 + xvmaddasp vs0, vs36, alpha_vr + xvmaddasp vs1, vs37, alpha_vr #endif - - - stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 @@ -4971,58 +3189,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif - - stxvw4x vs38, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs0, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs0, vs0, vs28 -#endif - - stxvw4x vs39, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - #ifdef TRMMKERNEL - lxvw4x vs1, o0, TBUFFER + xvmulsp vs0, vs38, alpha_vr + xvmulsp vs1, vs39, alpha_vr #else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs1, vs1, vs28 + xvmaddasp vs0, vs38, alpha_vr + xvmaddasp vs1, vs39, alpha_vr #endif - - - stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 @@ -5043,14 +3217,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 16 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs8, vs28, 0 - xxspltw vs9, vs28, 1 - xxspltw vs10, vs28, 2 - xxspltw vs11, vs28, 3 + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + lxvw4x vs10, o32, T1 + lxvw4x vs11, o48, T1 - addi BO, BO, 16 + addi BO, BO, 64 .endm @@ -5061,14 +3235,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 16 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs16, vs28, 0 - xxspltw vs17, vs28, 1 - xxspltw vs18, vs28, 2 - xxspltw vs19, vs28, 3 + lxvw4x vs16, o0, T1 + lxvw4x vs17, o16, T1 + lxvw4x vs18, o32, T1 + lxvw4x vs19, o48, T1 - addi BO, BO, 16 + addi BO, BO, 64 xvmulsp vs32, vs0, vs8 @@ -5089,14 +3263,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi AO, AO, 16 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs16, vs28, 0 - xxspltw vs17, vs28, 1 - xxspltw vs18, vs28, 2 - xxspltw vs19, vs28, 3 + lxvw4x vs16, o0, T1 + lxvw4x vs17, o16, T1 + lxvw4x vs18, o32, T1 + lxvw4x vs19, o48, T1 - addi BO, BO, 16 + addi BO, BO, 64 xvmaddasp vs32, vs0, vs8 @@ -5117,14 +3291,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 16 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs8, vs28, 0 - xxspltw vs9, vs28, 1 - xxspltw vs10, vs28, 2 - xxspltw vs11, vs28, 3 + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + lxvw4x vs10, o32, T1 + lxvw4x vs11, o48, T1 - addi BO, BO, 16 + addi BO, BO, 64 xvmaddasp vs32, vs4, vs16 @@ -5159,14 +3333,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 16 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs8, vs28, 0 - xxspltw vs9, vs28, 1 - xxspltw vs10, vs28, 2 - xxspltw vs11, vs28, 3 + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + lxvw4x vs10, o32, T1 + lxvw4x vs11, o48, T1 - addi BO, BO, 16 + addi BO, BO, 64 xvmulsp vs32, vs0, vs8 @@ -5187,14 +3361,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 16 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs8, vs28, 0 - xxspltw vs9, vs28, 1 - xxspltw vs10, vs28, 2 - xxspltw vs11, vs28, 3 + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 + lxvw4x vs10, o32, T1 + lxvw4x vs11, o48, T1 - addi BO, BO, 16 + addi BO, BO, 64 xvmaddasp vs32, vs0, vs8 @@ -5218,34 +3392,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif - - stxvw4x vs32, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - #ifdef TRMMKERNEL - lxvw4x vs0, o0, TBUFFER + xvmulsp vs0, vs32, alpha_vr #else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs0, vs0, vs28 + xvmaddasp vs0, vs32, alpha_vr #endif - - - stxvw4x vs0, o0, T1 add T1, T1, LDC @@ -5257,34 +3409,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif - - stxvw4x vs33, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - #ifdef TRMMKERNEL - lxvw4x vs0, o0, TBUFFER + xvmulsp vs0, vs33, alpha_vr #else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs0, vs0, vs28 + xvmaddasp vs0, vs33, alpha_vr #endif - - - stxvw4x vs0, o0, T1 add T1, T1, LDC @@ -5296,34 +3426,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif - - stxvw4x vs34, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - #ifdef TRMMKERNEL - lxvw4x vs0, o0, TBUFFER + xvmulsp vs0, vs34, alpha_vr #else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs0, vs0, vs28 + xvmaddasp vs0, vs34, alpha_vr #endif - - - stxvw4x vs0, o0, T1 add T1, T1, LDC @@ -5335,34 +3443,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif - - stxvw4x vs35, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - #ifdef TRMMKERNEL - lxvw4x vs0, o0, TBUFFER + xvmulsp vs0, vs35, alpha_vr #else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs0, vs0, vs28 + xvmaddasp vs0, vs35, alpha_vr #endif - - - stxvw4x vs0, o0, T1 add T1, T1, LDC @@ -5386,11 +3472,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr T1, BO lxsspx vs8, o0, T1 - lxsspx vs9, o4, T1 - lxsspx vs10, o8, T1 - lxsspx vs11, o12, T1 + lxsspx vs9, o16, T1 + lxsspx vs10, o32, T1 + lxsspx vs11, o48, T1 - addi BO, BO, 16 + + addi BO, BO, 64 .endm @@ -5405,24 +3492,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
mr T1, BO lxsspx vs16, o0, T1 - lxsspx vs17, o4, T1 - lxsspx vs18, o8, T1 - lxsspx vs19, o12, T1 + lxsspx vs17, o16, T1 + lxsspx vs18, o32, T1 + lxsspx vs19, o48, T1 - addi BO, BO, 16 + + addi BO, BO, 64 - xsmulsp vs32, vs0, vs8 - xsmulsp vs33, vs1, vs8 + xsmuldp vs32, vs0, vs8 + xsmuldp vs33, vs1, vs8 - xsmulsp vs34, vs0, vs9 - xsmulsp vs35, vs1, vs9 + xsmuldp vs34, vs0, vs9 + xsmuldp vs35, vs1, vs9 - xsmulsp vs36, vs0, vs10 - xsmulsp vs37, vs1, vs10 + xsmuldp vs36, vs0, vs10 + xsmuldp vs37, vs1, vs10 - xsmulsp vs38, vs0, vs11 - xsmulsp vs39, vs1, vs11 + xsmuldp vs38, vs0, vs11 + xsmuldp vs39, vs1, vs11 .endm @@ -5438,24 +3526,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr T1, BO lxsspx vs16, o0, T1 - lxsspx vs17, o4, T1 - lxsspx vs18, o8, T1 - lxsspx vs19, o12, T1 + lxsspx vs17, o16, T1 + lxsspx vs18, o32, T1 + lxsspx vs19, o48, T1 - addi BO, BO, 16 + + addi BO, BO, 64 - xsmaddasp vs32, vs0, vs8 - xsmaddasp vs33, vs1, vs8 + xsmaddadp vs32, vs0, vs8 + xsmaddadp vs33, vs1, vs8 - xsmaddasp vs34, vs0, vs9 - xsmaddasp vs35, vs1, vs9 + xsmaddadp vs34, vs0, vs9 + xsmaddadp vs35, vs1, vs9 - xsmaddasp vs36, vs0, vs10 - xsmaddasp vs37, vs1, vs10 + xsmaddadp vs36, vs0, vs10 + xsmaddadp vs37, vs1, vs10 - xsmaddasp vs38, vs0, vs11 - xsmaddasp vs39, vs1, vs11 + xsmaddadp vs38, vs0, vs11 + xsmaddadp vs39, vs1, vs11 .endm @@ -5471,24 +3560,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
mr T1, BO lxsspx vs8, o0, T1 - lxsspx vs9, o4, T1 - lxsspx vs10, o8, T1 - lxsspx vs11, o12, T1 + lxsspx vs9, o16, T1 + lxsspx vs10, o32, T1 + lxsspx vs11, o48, T1 - addi BO, BO, 16 + + addi BO, BO, 64 - xsmaddasp vs32, vs4, vs16 - xsmaddasp vs33, vs5, vs16 + xsmaddadp vs32, vs4, vs16 + xsmaddadp vs33, vs5, vs16 - xsmaddasp vs34, vs4, vs17 - xsmaddasp vs35, vs5, vs17 + xsmaddadp vs34, vs4, vs17 + xsmaddadp vs35, vs5, vs17 - xsmaddasp vs36, vs4, vs18 - xsmaddasp vs37, vs5, vs18 + xsmaddadp vs36, vs4, vs18 + xsmaddadp vs37, vs5, vs18 - xsmaddasp vs38, vs4, vs19 - xsmaddasp vs39, vs5, vs19 + xsmaddadp vs38, vs4, vs19 + xsmaddadp vs39, vs5, vs19 .endm @@ -5496,17 +3586,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x2_E2 - xsmaddasp vs32, vs4, vs16 - xsmaddasp vs33, vs5, vs16 + xsmaddadp vs32, vs4, vs16 + xsmaddadp vs33, vs5, vs16 - xsmaddasp vs34, vs4, vs17 - xsmaddasp vs35, vs5, vs17 + xsmaddadp vs34, vs4, vs17 + xsmaddadp vs35, vs5, vs17 - xsmaddasp vs36, vs4, vs18 - xsmaddasp vs37, vs5, vs18 + xsmaddadp vs36, vs4, vs18 + xsmaddadp vs37, vs5, vs18 - xsmaddasp vs38, vs4, vs19 - xsmaddasp vs39, vs5, vs19 + xsmaddadp vs38, vs4, vs19 + xsmaddadp vs39, vs5, vs19 .endm @@ -5522,24 +3612,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
mr T1, BO lxsspx vs8, o0, T1 - lxsspx vs9, o4, T1 - lxsspx vs10, o8, T1 - lxsspx vs11, o12, T1 + lxsspx vs9, o16, T1 + lxsspx vs10, o32, T1 + lxsspx vs11, o48, T1 - addi BO, BO, 16 + + addi BO, BO, 64 - xsmulsp vs32, vs0, vs8 - xsmulsp vs33, vs1, vs8 + xsmuldp vs32, vs0, vs8 + xsmuldp vs33, vs1, vs8 - xsmulsp vs34, vs0, vs9 - xsmulsp vs35, vs1, vs9 + xsmuldp vs34, vs0, vs9 + xsmuldp vs35, vs1, vs9 - xsmulsp vs36, vs0, vs10 - xsmulsp vs37, vs1, vs10 + xsmuldp vs36, vs0, vs10 + xsmuldp vs37, vs1, vs10 - xsmulsp vs38, vs0, vs11 - xsmulsp vs39, vs1, vs11 + xsmuldp vs38, vs0, vs11 + xsmuldp vs39, vs1, vs11 .endm @@ -5555,24 +3646,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr T1, BO lxsspx vs8, o0, T1 - lxsspx vs9, o4, T1 - lxsspx vs10, o8, T1 - lxsspx vs11, o12, T1 + lxsspx vs9, o16, T1 + lxsspx vs10, o32, T1 + lxsspx vs11, o48, T1 - addi BO, BO, 16 + + addi BO, BO, 64 - xsmaddasp vs32, vs0, vs8 - xsmaddasp vs33, vs1, vs8 + xsmaddadp vs32, vs0, vs8 + xsmaddadp vs33, vs1, vs8 - xsmaddasp vs34, vs0, vs9 - xsmaddasp vs35, vs1, vs9 + xsmaddadp vs34, vs0, vs9 + xsmaddadp vs35, vs1, vs9 - xsmaddasp vs36, vs0, vs10 - xsmaddasp vs37, vs1, vs10 + xsmaddadp vs36, vs0, vs10 + xsmaddadp vs37, vs1, vs10 - xsmaddasp vs38, vs0, vs11 - xsmaddasp vs39, vs1, vs11 + xsmaddadp vs38, vs0, vs11 + xsmaddadp vs39, vs1, vs11 .endm @@ -5589,17 +3681,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifdef TRMMKERNEL - - xsmulsp vs0, vs32, alpha_r - xsmulsp vs1, vs33, alpha_r - + xsmuldp vs0, vs32, alpha_r + xsmuldp vs1, vs33, alpha_r #else - - xsmulsp vs28, vs32, alpha_r - xsaddsp vs0, vs0, vs28 - xsmulsp vs28, vs33, alpha_r - xsaddsp vs1, vs1, vs28 - + xsmaddadp vs0, vs32, alpha_r + xsmaddadp vs1, vs33, alpha_r #endif stxsspx vs0, o0, T1 @@ -5616,17 +3702,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif #ifdef TRMMKERNEL - - xsmulsp vs0, vs34, alpha_r - xsmulsp vs1, vs35, alpha_r - + xsmuldp vs0, vs34, alpha_r + xsmuldp vs1, vs35, alpha_r #else - - xsmulsp vs28, vs34, alpha_r - xsaddsp vs0, vs0, vs28 - xsmulsp vs28, vs35, alpha_r - xsaddsp vs1, vs1, vs28 - + xsmaddadp vs0, vs34, alpha_r + xsmaddadp vs1, vs35, alpha_r #endif stxsspx vs0, o0, T1 @@ -5643,17 +3723,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifdef TRMMKERNEL - - xsmulsp vs0, vs36, alpha_r - xsmulsp vs1, vs37, alpha_r - + xsmuldp vs0, vs36, alpha_r + xsmuldp vs1, vs37, alpha_r #else - - xsmulsp vs28, vs36, alpha_r - xsaddsp vs0, vs0, vs28 - xsmulsp vs28, vs37, alpha_r - xsaddsp vs1, vs1, vs28 - + xsmaddadp vs0, vs36, alpha_r + xsmaddadp vs1, vs37, alpha_r #endif stxsspx vs0, o0, T1 @@ -5670,17 +3744,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifdef TRMMKERNEL - - xsmulsp vs0, vs38, alpha_r - xsmulsp vs1, vs39, alpha_r - + xsmuldp vs0, vs38, alpha_r + xsmuldp vs1, vs39, alpha_r #else - - xsmulsp vs28, vs38, alpha_r - xsaddsp vs0, vs0, vs28 - xsmulsp vs28, vs39, alpha_r - xsaddsp vs1, vs1, vs28 - + xsmaddadp vs0, vs38, alpha_r + xsmaddadp vs1, vs39, alpha_r #endif stxsspx vs0, o0, T1 @@ -5706,11 +3774,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr T1, BO lxsspx vs8, o0, T1 - lxsspx vs9, o4, T1 - lxsspx vs10, o8, T1 - lxsspx vs11, o12, T1 + lxsspx vs9, o16, T1 + lxsspx vs10, o32, T1 + lxsspx vs11, o48, T1 - addi BO, BO, 16 + + addi BO, BO, 64 .endm @@ -5724,20 +3793,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
mr T1, BO lxsspx vs16, o0, T1 - lxsspx vs17, o4, T1 - lxsspx vs18, o8, T1 - lxsspx vs19, o12, T1 + lxsspx vs17, o16, T1 + lxsspx vs18, o32, T1 + lxsspx vs19, o48, T1 - addi BO, BO, 16 + + addi BO, BO, 64 - xsmulsp vs32, vs0, vs8 + xsmuldp vs32, vs0, vs8 - xsmulsp vs33, vs0, vs9 + xsmuldp vs33, vs0, vs9 - xsmulsp vs34, vs0, vs10 + xsmuldp vs34, vs0, vs10 - xsmulsp vs35, vs0, vs11 + xsmuldp vs35, vs0, vs11 .endm @@ -5752,20 +3822,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr T1, BO lxsspx vs16, o0, T1 - lxsspx vs17, o4, T1 - lxsspx vs18, o8, T1 - lxsspx vs19, o12, T1 + lxsspx vs17, o16, T1 + lxsspx vs18, o32, T1 + lxsspx vs19, o48, T1 - addi BO, BO, 16 + + addi BO, BO, 64 - xsmaddasp vs32, vs0, vs8 + xsmaddadp vs32, vs0, vs8 - xsmaddasp vs33, vs0, vs9 + xsmaddadp vs33, vs0, vs9 - xsmaddasp vs34, vs0, vs10 + xsmaddadp vs34, vs0, vs10 - xsmaddasp vs35, vs0, vs11 + xsmaddadp vs35, vs0, vs11 .endm @@ -5780,20 +3851,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr T1, BO lxsspx vs8, o0, T1 - lxsspx vs9, o4, T1 - lxsspx vs10, o8, T1 - lxsspx vs11, o12, T1 + lxsspx vs9, o16, T1 + lxsspx vs10, o32, T1 + lxsspx vs11, o48, T1 - addi BO, BO, 16 + + addi BO, BO, 64 - xsmaddasp vs32, vs4, vs16 + xsmaddadp vs32, vs4, vs16 - xsmaddasp vs33, vs4, vs17 + xsmaddadp vs33, vs4, vs17 - xsmaddasp vs34, vs4, vs18 + xsmaddadp vs34, vs4, vs18 - xsmaddasp vs35, vs4, vs19 + xsmaddadp vs35, vs4, vs19 .endm @@ -5801,13 +3873,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x1_E2 - xsmaddasp vs32, vs4, vs16 + xsmaddadp vs32, vs4, vs16 - xsmaddasp vs33, vs4, vs17 + xsmaddadp vs33, vs4, vs17 - xsmaddasp vs34, vs4, vs18 + xsmaddadp vs34, vs4, vs18 - xsmaddasp vs35, vs4, vs19 + xsmaddadp vs35, vs4, vs19 .endm @@ -5822,20 +3894,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
mr T1, BO lxsspx vs8, o0, T1 - lxsspx vs9, o4, T1 - lxsspx vs10, o8, T1 - lxsspx vs11, o12, T1 + lxsspx vs9, o16, T1 + lxsspx vs10, o32, T1 + lxsspx vs11, o48, T1 - addi BO, BO, 16 + + addi BO, BO, 64 - xsmulsp vs32, vs0, vs8 + xsmuldp vs32, vs0, vs8 - xsmulsp vs33, vs0, vs9 + xsmuldp vs33, vs0, vs9 - xsmulsp vs34, vs0, vs10 + xsmuldp vs34, vs0, vs10 - xsmulsp vs35, vs0, vs11 + xsmuldp vs35, vs0, vs11 .endm @@ -5850,20 +3923,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr T1, BO lxsspx vs8, o0, T1 - lxsspx vs9, o4, T1 - lxsspx vs10, o8, T1 - lxsspx vs11, o12, T1 + lxsspx vs9, o16, T1 + lxsspx vs10, o32, T1 + lxsspx vs11, o48, T1 - addi BO, BO, 16 + + addi BO, BO, 64 - xsmaddasp vs32, vs0, vs8 + xsmaddadp vs32, vs0, vs8 - xsmaddasp vs33, vs0, vs9 + xsmaddadp vs33, vs0, vs9 - xsmaddasp vs34, vs0, vs10 + xsmaddadp vs34, vs0, vs10 - xsmaddasp vs35, vs0, vs11 + xsmaddadp vs35, vs0, vs11 .endm @@ -5879,14 +3953,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifdef TRMMKERNEL - - xsmulsp vs0, vs32, alpha_r - + xsmuldp vs0, vs32, alpha_r #else - - xsmulsp vs28, vs32, alpha_r - xsaddsp vs0, vs0, vs28 - + xsmaddadp vs0, vs32, alpha_r #endif stxsspx vs0, o0, T1 @@ -5901,14 +3970,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifdef TRMMKERNEL - - xsmulsp vs0, vs33, alpha_r - + xsmuldp vs0, vs33, alpha_r #else - - xsmulsp vs28, vs33, alpha_r - xsaddsp vs0, vs0, vs28 - + xsmaddadp vs0, vs33, alpha_r #endif stxsspx vs0, o0, T1 @@ -5923,14 +3987,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifdef TRMMKERNEL - - xsmulsp vs0, vs34, alpha_r - + xsmuldp vs0, vs34, alpha_r #else - - xsmulsp vs28, vs34, alpha_r - xsaddsp vs0, vs0, vs28 - + xsmaddadp vs0, vs34, alpha_r #endif stxsspx vs0, o0, T1 @@ -5945,14 +4004,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif #ifdef TRMMKERNEL - - xsmulsp vs0, vs35, alpha_r - + xsmuldp vs0, vs35, alpha_r #else - - xsmulsp vs28, vs35, alpha_r - xsaddsp vs0, vs0, vs28 - + xsmaddadp vs0, vs35, alpha_r #endif stxsspx vs0, o0, T1 @@ -5977,12 +4031,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs8, vs28, 0 - xxspltw vs9, vs28, 1 + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 - addi BO, BO, 8 + addi BO, BO, 32 .endm @@ -5996,12 +4050,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs16, vs28, 0 - xxspltw vs17, vs28, 1 + lxvw4x vs16, o0, T1 + lxvw4x vs17, o16, T1 - addi BO, BO, 8 + addi BO, BO, 32 xvmulsp vs32, vs0, vs8 @@ -6027,12 +4081,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs16, vs28, 0 - xxspltw vs17, vs28, 1 + lxvw4x vs16, o0, T1 + lxvw4x vs17, o16, T1 - addi BO, BO, 8 + addi BO, BO, 32 xvmaddasp vs32, vs0, vs8 @@ -6058,12 +4112,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs8, vs28, 0 - xxspltw vs9, vs28, 1 + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 - addi BO, BO, 8 + addi BO, BO, 32 xvmaddasp vs32, vs4, vs16 @@ -6105,12 +4159,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs8, vs28, 0 - xxspltw vs9, vs28, 1 + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 - addi BO, BO, 8 + addi BO, BO, 32 xvmulsp vs32, vs0, vs8 @@ -6136,12 +4190,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi AO, AO, 64 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs8, vs28, 0 - xxspltw vs9, vs28, 1 + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 - addi BO, BO, 8 + addi BO, BO, 32 xvmaddasp vs32, vs0, vs8 @@ -6170,223 +4224,47 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif - - stxvw4x vs32, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs0, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs0, vs0, vs28 -#endif - - stxvw4x vs33, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs1, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs1, vs1, vs28 -#endif - - stxvw4x vs34, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs2, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs2, vs2, vs28 -#endif - - stxvw4x vs35, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, 
o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs3, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs3, vs3, vs28 -#endif - - - - - stxvw4x vs0, o0, T1 - stxvw4x vs1, o16, T1 - stxvw4x vs2, o32, T1 - stxvw4x vs3, o48, T1 - - add T1, T1, LDC - - -#ifndef TRMMKERNEL - - lxvw4x vs0, o0, T1 - lxvw4x vs1, o16, T1 - lxvw4x vs2, o32, T1 - lxvw4x vs3, o48, T1 - -#endif - - - stxvw4x vs36, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs0, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs0, vs0, vs28 -#endif - - stxvw4x vs37, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - #ifdef TRMMKERNEL - lxvw4x vs1, o0, TBUFFER + xvmulsp vs0, vs32, alpha_vr + xvmulsp vs1, vs33, alpha_vr + xvmulsp vs2, vs34, alpha_vr + xvmulsp vs3, vs35, alpha_vr #else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs1, vs1, vs28 + xvmaddasp vs0, vs32, alpha_vr + xvmaddasp vs1, vs33, alpha_vr + xvmaddasp vs2, vs34, alpha_vr + xvmaddasp vs3, vs35, alpha_vr #endif - stxvw4x vs38, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, 
o12, TBUFFER + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 -#ifdef TRMMKERNEL - lxvw4x vs2, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs2, vs2, vs28 -#endif + add T1, T1, LDC - stxvw4x vs39, o0, TBUFFER - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER +#ifndef TRMMKERNEL - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER +#endif #ifdef TRMMKERNEL - lxvw4x vs3, o0, TBUFFER + xvmulsp vs0, vs36, alpha_vr + xvmulsp vs1, vs37, alpha_vr + xvmulsp vs2, vs38, alpha_vr + xvmulsp vs3, vs39, alpha_vr #else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs3, vs3, vs28 + xvmaddasp vs0, vs36, alpha_vr + xvmaddasp vs1, vs37, alpha_vr + xvmaddasp vs2, vs38, alpha_vr + xvmaddasp vs3, vs39, alpha_vr #endif - - - stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 stxvw4x vs2, o32, T1 @@ -6410,12 +4288,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 32 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs8, vs28, 0 - xxspltw vs9, vs28, 1 + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 - addi BO, BO, 8 + addi BO, BO, 32 .endm @@ -6427,12 +4305,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 32 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs16, vs28, 0 - xxspltw vs17, vs28, 1 + lxvw4x vs16, o0, T1 + lxvw4x vs17, o16, T1 - addi BO, BO, 8 + addi BO, BO, 32 xvmulsp vs32, vs0, vs8 @@ -6452,12 +4330,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi AO, AO, 32 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs16, vs28, 0 - xxspltw vs17, vs28, 1 + lxvw4x vs16, o0, T1 + lxvw4x vs17, o16, T1 - addi BO, BO, 8 + addi BO, BO, 32 xvmaddasp vs32, vs0, vs8 @@ -6477,12 +4355,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 32 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs8, vs28, 0 - xxspltw vs9, vs28, 1 + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 - addi BO, BO, 8 + addi BO, BO, 32 xvmaddasp vs32, vs4, vs16 @@ -6514,12 +4392,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 32 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs8, vs28, 0 - xxspltw vs9, vs28, 1 + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 - addi BO, BO, 8 + addi BO, BO, 32 xvmulsp vs32, vs0, vs8 @@ -6539,12 +4417,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 32 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs8, vs28, 0 - xxspltw vs9, vs28, 1 + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 - addi BO, BO, 8 + addi BO, BO, 32 xvmaddasp vs32, vs0, vs8 @@ -6567,58 +4445,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif - - stxvw4x vs32, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs0, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs0, vs0, vs28 -#endif - - stxvw4x vs33, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - #ifdef TRMMKERNEL - lxvw4x vs1, o0, TBUFFER + xvmulsp vs0, vs32, alpha_vr + xvmulsp vs1, vs33, alpha_vr #else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs1, vs1, vs28 + xvmaddasp vs0, vs32, alpha_vr + xvmaddasp vs1, vs33, alpha_vr #endif - - - stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 @@ -6632,58 +4466,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif - - stxvw4x vs34, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs0, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs0, vs0, vs28 -#endif - - stxvw4x vs35, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - #ifdef TRMMKERNEL - lxvw4x vs1, o0, TBUFFER + xvmulsp vs0, vs34, alpha_vr + xvmulsp vs1, vs35, alpha_vr #else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs1, vs1, vs28 + xvmaddasp vs0, vs34, alpha_vr + xvmaddasp vs1, vs35, alpha_vr #endif - - - stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 @@ -6704,12 +4494,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 16 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs8, vs28, 0 - xxspltw vs9, vs28, 1 + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 - addi BO, BO, 8 + addi BO, BO, 32 .endm @@ -6720,12 +4510,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 16 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs16, vs28, 0 - xxspltw vs17, vs28, 1 + lxvw4x vs16, o0, T1 + lxvw4x vs17, o16, T1 - addi BO, BO, 8 + addi BO, BO, 32 xvmulsp vs32, vs0, vs8 @@ -6742,12 +4532,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi AO, AO, 16 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs16, vs28, 0 - xxspltw vs17, vs28, 1 + lxvw4x vs16, o0, T1 + lxvw4x vs17, o16, T1 - addi BO, BO, 8 + addi BO, BO, 32 xvmaddasp vs32, vs0, vs8 @@ -6764,12 +4554,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 16 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs8, vs28, 0 - xxspltw vs9, vs28, 1 + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 - addi BO, BO, 8 + addi BO, BO, 32 xvmaddasp vs32, vs4, vs16 @@ -6796,12 +4586,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 16 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs8, vs28, 0 - xxspltw vs9, vs28, 1 + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 - addi BO, BO, 8 + addi BO, BO, 32 xvmulsp vs32, vs0, vs8 @@ -6818,12 +4608,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 16 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs8, vs28, 0 - xxspltw vs9, vs28, 1 + lxvw4x vs8, o0, T1 + lxvw4x vs9, o16, T1 - addi BO, BO, 8 + addi BO, BO, 32 xvmaddasp vs32, vs0, vs8 @@ -6843,34 +4633,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif - - stxvw4x vs32, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - #ifdef TRMMKERNEL - lxvw4x vs0, o0, TBUFFER + xvmulsp vs0, vs32, alpha_vr #else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs0, vs0, vs28 + xvmaddasp vs0, vs32, alpha_vr #endif - - - stxvw4x vs0, o0, T1 add T1, T1, LDC @@ -6882,34 +4650,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif - - stxvw4x vs33, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - #ifdef TRMMKERNEL - lxvw4x vs0, o0, TBUFFER + xvmulsp vs0, vs33, alpha_vr #else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs0, vs0, vs28 + xvmaddasp vs0, vs33, alpha_vr #endif - - - stxvw4x vs0, o0, T1 add T1, T1, LDC @@ -6933,9 +4679,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr T1, BO lxsspx vs8, o0, T1 - lxsspx vs9, o4, T1 + lxsspx vs9, o16, T1 - addi BO, BO, 8 + + addi BO, BO, 32 .endm @@ -6950,16 +4697,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr T1, BO lxsspx vs16, o0, T1 - lxsspx vs17, o4, T1 + lxsspx vs17, o16, T1 - addi BO, BO, 8 + + addi BO, BO, 32 - xsmulsp vs32, vs0, vs8 - xsmulsp vs33, vs1, vs8 + xsmuldp vs32, vs0, vs8 + xsmuldp vs33, vs1, vs8 - xsmulsp vs34, vs0, vs9 - xsmulsp vs35, vs1, vs9 + xsmuldp vs34, vs0, vs9 + xsmuldp vs35, vs1, vs9 .endm @@ -6975,16 +4723,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr T1, BO lxsspx vs16, o0, T1 - lxsspx vs17, o4, T1 + lxsspx vs17, o16, T1 - addi BO, BO, 8 + + addi BO, BO, 32 - xsmaddasp vs32, vs0, vs8 - xsmaddasp vs33, vs1, vs8 + xsmaddadp vs32, vs0, vs8 + xsmaddadp vs33, vs1, vs8 - xsmaddasp vs34, vs0, vs9 - xsmaddasp vs35, vs1, vs9 + xsmaddadp vs34, vs0, vs9 + xsmaddadp vs35, vs1, vs9 .endm @@ -7000,16 +4749,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
mr T1, BO lxsspx vs8, o0, T1 - lxsspx vs9, o4, T1 + lxsspx vs9, o16, T1 - addi BO, BO, 8 + + addi BO, BO, 32 - xsmaddasp vs32, vs4, vs16 - xsmaddasp vs33, vs5, vs16 + xsmaddadp vs32, vs4, vs16 + xsmaddadp vs33, vs5, vs16 - xsmaddasp vs34, vs4, vs17 - xsmaddasp vs35, vs5, vs17 + xsmaddadp vs34, vs4, vs17 + xsmaddadp vs35, vs5, vs17 .endm @@ -7017,11 +4767,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL2x2_E2 - xsmaddasp vs32, vs4, vs16 - xsmaddasp vs33, vs5, vs16 + xsmaddadp vs32, vs4, vs16 + xsmaddadp vs33, vs5, vs16 - xsmaddasp vs34, vs4, vs17 - xsmaddasp vs35, vs5, vs17 + xsmaddadp vs34, vs4, vs17 + xsmaddadp vs35, vs5, vs17 .endm @@ -7037,16 +4787,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr T1, BO lxsspx vs8, o0, T1 - lxsspx vs9, o4, T1 + lxsspx vs9, o16, T1 - addi BO, BO, 8 + + addi BO, BO, 32 - xsmulsp vs32, vs0, vs8 - xsmulsp vs33, vs1, vs8 + xsmuldp vs32, vs0, vs8 + xsmuldp vs33, vs1, vs8 - xsmulsp vs34, vs0, vs9 - xsmulsp vs35, vs1, vs9 + xsmuldp vs34, vs0, vs9 + xsmuldp vs35, vs1, vs9 .endm @@ -7062,16 +4813,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr T1, BO lxsspx vs8, o0, T1 - lxsspx vs9, o4, T1 + lxsspx vs9, o16, T1 - addi BO, BO, 8 + + addi BO, BO, 32 - xsmaddasp vs32, vs0, vs8 - xsmaddasp vs33, vs1, vs8 + xsmaddadp vs32, vs0, vs8 + xsmaddadp vs33, vs1, vs8 - xsmaddasp vs34, vs0, vs9 - xsmaddasp vs35, vs1, vs9 + xsmaddadp vs34, vs0, vs9 + xsmaddadp vs35, vs1, vs9 .endm @@ -7088,17 +4840,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif #ifdef TRMMKERNEL - - xsmulsp vs0, vs32, alpha_r - xsmulsp vs1, vs33, alpha_r - + xsmuldp vs0, vs32, alpha_r + xsmuldp vs1, vs33, alpha_r #else - - xsmulsp vs28, vs32, alpha_r - xsaddsp vs0, vs0, vs28 - xsmulsp vs28, vs33, alpha_r - xsaddsp vs1, vs1, vs28 - + xsmaddadp vs0, vs32, alpha_r + xsmaddadp vs1, vs33, alpha_r #endif stxsspx vs0, o0, T1 @@ -7115,17 +4861,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifdef TRMMKERNEL - - xsmulsp vs0, vs34, alpha_r - xsmulsp vs1, vs35, alpha_r - + xsmuldp vs0, vs34, alpha_r + xsmuldp vs1, vs35, alpha_r #else - - xsmulsp vs28, vs34, alpha_r - xsaddsp vs0, vs0, vs28 - xsmulsp vs28, vs35, alpha_r - xsaddsp vs1, vs1, vs28 - + xsmaddadp vs0, vs34, alpha_r + xsmaddadp vs1, vs35, alpha_r #endif stxsspx vs0, o0, T1 @@ -7151,9 +4891,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr T1, BO lxsspx vs8, o0, T1 - lxsspx vs9, o4, T1 + lxsspx vs9, o16, T1 - addi BO, BO, 8 + + addi BO, BO, 32 .endm @@ -7167,14 +4908,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr T1, BO lxsspx vs16, o0, T1 - lxsspx vs17, o4, T1 + lxsspx vs17, o16, T1 - addi BO, BO, 8 + + addi BO, BO, 32 - xsmulsp vs32, vs0, vs8 + xsmuldp vs32, vs0, vs8 - xsmulsp vs33, vs0, vs9 + xsmuldp vs33, vs0, vs9 .endm @@ -7189,14 +4931,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr T1, BO lxsspx vs16, o0, T1 - lxsspx vs17, o4, T1 + lxsspx vs17, o16, T1 - addi BO, BO, 8 + + addi BO, BO, 32 - xsmaddasp vs32, vs0, vs8 + xsmaddadp vs32, vs0, vs8 - xsmaddasp vs33, vs0, vs9 + xsmaddadp vs33, vs0, vs9 .endm @@ -7211,14 +4954,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
mr T1, BO lxsspx vs8, o0, T1 - lxsspx vs9, o4, T1 + lxsspx vs9, o16, T1 - addi BO, BO, 8 + + addi BO, BO, 32 - xsmaddasp vs32, vs4, vs16 + xsmaddadp vs32, vs4, vs16 - xsmaddasp vs33, vs4, vs17 + xsmaddadp vs33, vs4, vs17 .endm @@ -7226,9 +4970,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL2x1_E2 - xsmaddasp vs32, vs4, vs16 + xsmaddadp vs32, vs4, vs16 - xsmaddasp vs33, vs4, vs17 + xsmaddadp vs33, vs4, vs17 .endm @@ -7243,14 +4987,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr T1, BO lxsspx vs8, o0, T1 - lxsspx vs9, o4, T1 + lxsspx vs9, o16, T1 - addi BO, BO, 8 + + addi BO, BO, 32 - xsmulsp vs32, vs0, vs8 + xsmuldp vs32, vs0, vs8 - xsmulsp vs33, vs0, vs9 + xsmuldp vs33, vs0, vs9 .endm @@ -7265,14 +5010,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr T1, BO lxsspx vs8, o0, T1 - lxsspx vs9, o4, T1 + lxsspx vs9, o16, T1 - addi BO, BO, 8 + + addi BO, BO, 32 - xsmaddasp vs32, vs0, vs8 + xsmaddadp vs32, vs0, vs8 - xsmaddasp vs33, vs0, vs9 + xsmaddadp vs33, vs0, vs9 .endm @@ -7288,14 +5034,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifdef TRMMKERNEL - - xsmulsp vs0, vs32, alpha_r - + xsmuldp vs0, vs32, alpha_r #else - - xsmulsp vs28, vs32, alpha_r - xsaddsp vs0, vs0, vs28 - + xsmaddadp vs0, vs32, alpha_r #endif stxsspx vs0, o0, T1 @@ -7310,14 +5051,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifdef TRMMKERNEL - - xsmulsp vs0, vs33, alpha_r - + xsmuldp vs0, vs33, alpha_r #else - - xsmulsp vs28, vs33, alpha_r - xsaddsp vs0, vs0, vs28 - + xsmaddadp vs0, vs33, alpha_r #endif stxsspx vs0, o0, T1 @@ -7342,11 +5078,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi AO, AO, 64 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs8, vs28, 0 + lxvw4x vs8, o0, T1 - addi BO, BO, 4 + addi BO, BO, 16 .endm @@ -7360,11 +5096,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs16, vs28, 0 + lxvw4x vs16, o0, T1 - addi BO, BO, 4 + addi BO, BO, 16 xvmulsp vs32, vs0, vs8 @@ -7385,11 +5121,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs16, vs28, 0 + lxvw4x vs16, o0, T1 - addi BO, BO, 4 + addi BO, BO, 16 xvmaddasp vs32, vs0, vs8 @@ -7410,11 +5146,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs8, vs28, 0 + lxvw4x vs8, o0, T1 - addi BO, BO, 4 + addi BO, BO, 16 xvmaddasp vs32, vs4, vs16 @@ -7446,11 +5182,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs8, vs28, 0 + lxvw4x vs8, o0, T1 - addi BO, BO, 4 + addi BO, BO, 16 xvmulsp vs32, vs0, vs8 @@ -7471,11 +5207,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs8, vs28, 0 + lxvw4x vs8, o0, T1 - addi BO, BO, 4 + addi BO, BO, 16 xvmaddasp vs32, vs0, vs8 @@ -7499,106 +5235,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif - - stxvw4x vs32, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs0, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs0, vs0, vs28 -#endif - - stxvw4x vs33, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs1, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs1, vs1, vs28 -#endif - - stxvw4x vs34, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs2, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs2, vs2, vs28 -#endif - - stxvw4x vs35, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - #ifdef TRMMKERNEL - lxvw4x vs3, o0, TBUFFER + xvmulsp vs0, vs32, alpha_vr + xvmulsp vs1, vs33, alpha_vr + xvmulsp vs2, vs34, alpha_vr + xvmulsp vs3, vs35, alpha_vr #else - lxvw4x vs28, o0, TBUFFER - 
xvaddsp vs3, vs3, vs28 + xvmaddasp vs0, vs32, alpha_vr + xvmaddasp vs1, vs33, alpha_vr + xvmaddasp vs2, vs34, alpha_vr + xvmaddasp vs3, vs35, alpha_vr #endif - - - stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 stxvw4x vs2, o32, T1 @@ -7622,11 +5270,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 32 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs8, vs28, 0 + lxvw4x vs8, o0, T1 - addi BO, BO, 4 + addi BO, BO, 16 .endm @@ -7638,11 +5286,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 32 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs16, vs28, 0 + lxvw4x vs16, o0, T1 - addi BO, BO, 4 + addi BO, BO, 16 xvmulsp vs32, vs0, vs8 @@ -7659,11 +5307,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 32 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs16, vs28, 0 + lxvw4x vs16, o0, T1 - addi BO, BO, 4 + addi BO, BO, 16 xvmaddasp vs32, vs0, vs8 @@ -7680,11 +5328,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 32 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs8, vs28, 0 + lxvw4x vs8, o0, T1 - addi BO, BO, 4 + addi BO, BO, 16 xvmaddasp vs32, vs4, vs16 @@ -7710,11 +5358,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 32 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs8, vs28, 0 + lxvw4x vs8, o0, T1 - addi BO, BO, 4 + addi BO, BO, 16 xvmulsp vs32, vs0, vs8 @@ -7731,11 +5379,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 32 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs8, vs28, 0 + lxvw4x vs8, o0, T1 - addi BO, BO, 4 + addi BO, BO, 16 xvmaddasp vs32, vs0, vs8 @@ -7755,58 +5403,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif - - stxvw4x vs32, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - -#ifdef TRMMKERNEL - lxvw4x vs0, o0, TBUFFER -#else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs0, vs0, vs28 -#endif - - stxvw4x vs33, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - #ifdef TRMMKERNEL - lxvw4x vs1, o0, TBUFFER + xvmulsp vs0, vs32, alpha_vr + xvmulsp vs1, vs33, alpha_vr #else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs1, vs1, vs28 + xvmaddasp vs0, vs32, alpha_vr + xvmaddasp vs1, vs33, alpha_vr #endif - - - stxvw4x vs0, o0, T1 stxvw4x vs1, o16, T1 @@ -7827,11 +5431,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 16 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs8, vs28, 0 + lxvw4x vs8, o0, T1 - addi BO, BO, 4 + addi BO, BO, 16 .endm @@ -7842,11 +5446,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 16 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs16, vs28, 0 + lxvw4x vs16, o0, T1 - addi BO, BO, 4 + addi BO, BO, 16 xvmulsp vs32, vs0, vs8 @@ -7861,11 +5465,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 16 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs16, vs28, 0 + lxvw4x vs16, o0, T1 - addi BO, BO, 4 + addi BO, BO, 16 xvmaddasp vs32, vs0, vs8 @@ -7880,11 +5484,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi AO, AO, 16 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs8, vs28, 0 + lxvw4x vs8, o0, T1 - addi BO, BO, 4 + addi BO, BO, 16 xvmaddasp vs32, vs4, vs16 @@ -7907,11 +5511,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 16 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs8, vs28, 0 + lxvw4x vs8, o0, T1 - addi BO, BO, 4 + addi BO, BO, 16 xvmulsp vs32, vs0, vs8 @@ -7926,11 +5530,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 16 - lxvw4x vs28, o0, BO + mr T1, BO - xxspltw vs8, vs28, 0 + lxvw4x vs8, o0, T1 - addi BO, BO, 4 + addi BO, BO, 16 xvmaddasp vs32, vs0, vs8 @@ -7948,34 +5552,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif - - stxvw4x vs32, o0, TBUFFER - - lxsspx vs4, o0, TBUFFER - lxsspx vs5, o4, TBUFFER - lxsspx vs6, o8, TBUFFER - lxsspx vs7, o12, TBUFFER - - xsmulsp vs4, vs4, alpha_r - xsmulsp vs5, vs5, alpha_r - xsmulsp vs6, vs6, alpha_r - xsmulsp vs7, vs7, alpha_r - - stxsspx vs4, o0, TBUFFER - stxsspx vs5, o4, TBUFFER - stxsspx vs6, o8, TBUFFER - stxsspx vs7, o12, TBUFFER - #ifdef TRMMKERNEL - lxvw4x vs0, o0, TBUFFER + xvmulsp vs0, vs32, alpha_vr #else - lxvw4x vs28, o0, TBUFFER - xvaddsp vs0, vs0, vs28 + xvmaddasp vs0, vs32, alpha_vr #endif - - - stxvw4x vs0, o0, T1 add T1, T1, LDC @@ -8000,7 +5582,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxsspx vs8, o0, T1 - addi BO, BO, 4 + + addi BO, BO, 16 .endm @@ -8016,11 +5599,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxsspx vs16, o0, T1 - addi BO, BO, 4 + + addi BO, BO, 16 - xsmulsp vs32, vs0, vs8 - xsmulsp vs33, vs1, vs8 + xsmuldp vs32, vs0, vs8 + xsmuldp vs33, vs1, vs8 .endm @@ -8037,11 +5621,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
lxsspx vs16, o0, T1 - addi BO, BO, 4 + + addi BO, BO, 16 - xsmaddasp vs32, vs0, vs8 - xsmaddasp vs33, vs1, vs8 + xsmaddadp vs32, vs0, vs8 + xsmaddadp vs33, vs1, vs8 .endm @@ -8058,11 +5643,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxsspx vs8, o0, T1 - addi BO, BO, 4 + + addi BO, BO, 16 - xsmaddasp vs32, vs4, vs16 - xsmaddasp vs33, vs5, vs16 + xsmaddadp vs32, vs4, vs16 + xsmaddadp vs33, vs5, vs16 .endm @@ -8070,8 +5656,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL1x2_E2 - xsmaddasp vs32, vs4, vs16 - xsmaddasp vs33, vs5, vs16 + xsmaddadp vs32, vs4, vs16 + xsmaddadp vs33, vs5, vs16 .endm @@ -8088,11 +5674,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxsspx vs8, o0, T1 - addi BO, BO, 4 + + addi BO, BO, 16 - xsmulsp vs32, vs0, vs8 - xsmulsp vs33, vs1, vs8 + xsmuldp vs32, vs0, vs8 + xsmuldp vs33, vs1, vs8 .endm @@ -8109,11 +5696,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxsspx vs8, o0, T1 - addi BO, BO, 4 + + addi BO, BO, 16 - xsmaddasp vs32, vs0, vs8 - xsmaddasp vs33, vs1, vs8 + xsmaddadp vs32, vs0, vs8 + xsmaddadp vs33, vs1, vs8 .endm @@ -8130,17 +5718,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifdef TRMMKERNEL - - xsmulsp vs0, vs32, alpha_r - xsmulsp vs1, vs33, alpha_r - + xsmuldp vs0, vs32, alpha_r + xsmuldp vs1, vs33, alpha_r #else - - xsmulsp vs28, vs32, alpha_r - xsaddsp vs0, vs0, vs28 - xsmulsp vs28, vs33, alpha_r - xsaddsp vs1, vs1, vs28 - + xsmaddadp vs0, vs32, alpha_r + xsmaddadp vs1, vs33, alpha_r #endif stxsspx vs0, o0, T1 @@ -8167,7 +5749,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxsspx vs8, o0, T1 - addi BO, BO, 4 + + addi BO, BO, 16 .endm @@ -8182,10 +5765,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
lxsspx vs16, o0, T1 - addi BO, BO, 4 + + addi BO, BO, 16 - xsmulsp vs32, vs0, vs8 + xsmuldp vs32, vs0, vs8 .endm @@ -8201,10 +5785,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxsspx vs16, o0, T1 - addi BO, BO, 4 + + addi BO, BO, 16 - xsmaddasp vs32, vs0, vs8 + xsmaddadp vs32, vs0, vs8 .endm @@ -8220,10 +5805,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxsspx vs8, o0, T1 - addi BO, BO, 4 + + addi BO, BO, 16 - xsmaddasp vs32, vs4, vs16 + xsmaddadp vs32, vs4, vs16 .endm @@ -8231,7 +5817,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL1x1_E2 - xsmaddasp vs32, vs4, vs16 + xsmaddadp vs32, vs4, vs16 .endm @@ -8247,10 +5833,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxsspx vs8, o0, T1 - addi BO, BO, 4 + + addi BO, BO, 16 - xsmulsp vs32, vs0, vs8 + xsmuldp vs32, vs0, vs8 .endm @@ -8266,10 +5853,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxsspx vs8, o0, T1 - addi BO, BO, 4 + + addi BO, BO, 16 - xsmaddasp vs32, vs0, vs8 + xsmaddadp vs32, vs0, vs8 .endm @@ -8285,14 +5873,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifdef TRMMKERNEL - - xsmulsp vs0, vs32, alpha_r - + xsmuldp vs0, vs32, alpha_r #else - - xsmulsp vs28, vs32, alpha_r - xsaddsp vs0, vs0, vs28 - + xsmaddadp vs0, vs32, alpha_r #endif stxsspx vs0, o0, T1 diff --git a/kernel/power/strmm_kernel_16x8_power8.S b/kernel/power/strmm_kernel_16x8_power8.S index 5e607c58f..f756d5d92 100644 --- a/kernel/power/strmm_kernel_16x8_power8.S +++ b/kernel/power/strmm_kernel_16x8_power8.S @@ -26,11 +26,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ /************************************************************************************** -* 2016/03/18 Werner Saar (wernsaar@googlemail.com) +* 2016/04/02 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK -* LAPACK-TEST : OK +* LAPACK-TEST : OK **************************************************************************************/ /*********************************************************************/ @@ -128,6 +128,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #define alpha_r vs30 +#define alpha_vr vs31 #define o0 0 @@ -152,7 +153,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define PRE r30 #define T2 r31 -#include "sgemm_macros_16x8_power8.S" +#include "strmm_macros_16x8_power8.S" #ifndef NEEDPARAM @@ -264,11 +265,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. cmpwi cr0, M, 0 - ble .L999_H1 + ble L999_H1 cmpwi cr0, N, 0 - ble .L999_H1 + ble L999_H1 cmpwi cr0, K, 0 - ble .L999_H1 + ble L999_H1 li PRE, 256 li o4 , 4 @@ -280,16 +281,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi TBUFFER, SP, 320 addi T1, SP, 300 - stfs f1, 0(T1) - - lxsspx alpha_r, 0, T1 + stxsspx f1, o0 , T1 + stxsspx f1, o4 , T1 + stxsspx f1, o8 , T1 + stxsspx f1, o12 , T1 + lxsspx alpha_r, o0, T1 + lxvw4x alpha_vr, o0, T1 #include "strmm_logic_16x8_power8.S" -.L999: +L999: addi r3, 0, 0 lfd f14, 0(SP) diff --git a/kernel/power/strmm_logic_16x8_power8.S b/kernel/power/strmm_logic_16x8_power8.S index 8ec11f1ef..fb2d3f94b 100644 --- a/kernel/power/strmm_logic_16x8_power8.S +++ b/kernel/power/strmm_logic_16x8_power8.S @@ -26,18 +26,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ /************************************************************************************** -* 2016/03/18 Werner Saar (wernsaar@googlemail.com) +* 2016/04/02 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK -* LAPACK-TEST : OK +* LAPACK-TEST : OK **************************************************************************************/ - srawi. J, N, 3 - ble .LSTRMM_L8_END + ble STRMM_L8_END -.LSTRMM_L8_BEGIN: +STRMM_L8_BEGIN: mr CO, C mr AO, A @@ -49,9 +48,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif srawi. I, M, 4 - ble .LSTRMM_L8x16_END + ble STRMM_L8x16_END -.LSTRMM_L8x16_BEGIN: +STRMM_L8x16_BEGIN: #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) @@ -78,11 +77,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L - ble .LSTRMM_L8x16_SUB0 + ble STRMM_L8x16_SUB0 cmpwi cr0, L, 1 - ble .LSTRMM_L8x16_SUB4 + ble STRMM_L8x16_SUB4 -.LSTRMM_L8x16_LOOP_START: +STRMM_L8x16_LOOP_START: dcbt AO, PRE LOAD8x16_1 @@ -105,11 +104,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL8x16_2 addic. L, L, -2 - ble .LSTRMM_L8x16_LOOP_END + ble STRMM_L8x16_LOOP_END .align 5 -.LSTRMM_L8x16_LOOP: +STRMM_L8x16_LOOP: dcbt AO, PRE KERNEL8x16_1 @@ -130,9 +129,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL8x16_2 addic. L, L, -1 - bgt .LSTRMM_L8x16_LOOP + bgt STRMM_L8x16_LOOP -.LSTRMM_L8x16_LOOP_END: +STRMM_L8x16_LOOP_END: dcbt AO, PRE KERNEL8x16_1 @@ -151,9 +150,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL8x16_1 KERNEL8x16_E2 - b .LSTRMM_L8x16_SUB1 + b STRMM_L8x16_SUB1 -.LSTRMM_L8x16_SUB4: +STRMM_L8x16_SUB4: dcbt AO, PRE KERNEL8x16_SUBI1 @@ -169,31 +168,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
KERNEL8x16_SUB1 KERNEL8x16_SUB1 - b .LSTRMM_L8x16_SUB1 + b STRMM_L8x16_SUB1 -.LSTRMM_L8x16_SUB0: +STRMM_L8x16_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL8x16_SUBI1 addic. L, L, -1 - ble .LSTRMM_L8x16_SAVE - b .LSTRMM_L8x16_SUB2 + ble STRMM_L8x16_SAVE + b STRMM_L8x16_SUB2 -.LSTRMM_L8x16_SUB1: +STRMM_L8x16_SUB1: andi. L, K1, 7 // K1 & 7 -> L - ble .LSTRMM_L8x16_SAVE + ble STRMM_L8x16_SAVE -.LSTRMM_L8x16_SUB2: +STRMM_L8x16_SUB2: KERNEL8x16_SUB1 addic. L, L, -1 - bgt .LSTRMM_L8x16_SUB2 + bgt STRMM_L8x16_SUB2 -.LSTRMM_L8x16_SAVE: +STRMM_L8x16_SAVE: SAVE8x16 @@ -211,16 +210,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addic. I, I, -1 - bgt .LSTRMM_L8x16_BEGIN + bgt STRMM_L8x16_BEGIN -.LSTRMM_L8x16_END: +STRMM_L8x16_END: -.LSTRMM_L8x8_BEGIN: +STRMM_L8x8_BEGIN: andi. T2, M, 15 - ble .LSTRMM_L8x1_END + ble STRMM_L8x1_END andi. T1, M, 8 - ble .LSTRMM_L8x8_END + ble STRMM_L8x8_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO @@ -246,11 +245,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L - ble .LSTRMM_L8x8_SUB0 + ble STRMM_L8x8_SUB0 cmpwi cr0, L, 1 - ble .LSTRMM_L8x8_SUB4 + ble STRMM_L8x8_SUB4 -.LSTRMM_L8x8_LOOP_START: +STRMM_L8x8_LOOP_START: LOAD8x8_1 KERNEL8x8_I1 @@ -264,11 +263,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL8x8_2 addic. L, L, -2 - ble .LSTRMM_L8x8_LOOP_END + ble STRMM_L8x8_LOOP_END .align 5 -.LSTRMM_L8x8_LOOP: +STRMM_L8x8_LOOP: KERNEL8x8_1 KERNEL8x8_2 @@ -281,9 +280,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL8x8_2 addic. L, L, -1 - bgt .LSTRMM_L8x8_LOOP + bgt STRMM_L8x8_LOOP -.LSTRMM_L8x8_LOOP_END: +STRMM_L8x8_LOOP_END: KERNEL8x8_1 KERNEL8x8_2 @@ -295,9 +294,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
KERNEL8x8_1 KERNEL8x8_E2 - b .LSTRMM_L8x8_SUB1 + b STRMM_L8x8_SUB1 -.LSTRMM_L8x8_SUB4: +STRMM_L8x8_SUB4: KERNEL8x8_SUBI1 KERNEL8x8_SUB1 @@ -309,31 +308,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL8x8_SUB1 KERNEL8x8_SUB1 - b .LSTRMM_L8x8_SUB1 + b STRMM_L8x8_SUB1 -.LSTRMM_L8x8_SUB0: +STRMM_L8x8_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL8x8_SUBI1 addic. L, L, -1 - ble .LSTRMM_L8x8_SAVE - b .LSTRMM_L8x8_SUB2 + ble STRMM_L8x8_SAVE + b STRMM_L8x8_SUB2 -.LSTRMM_L8x8_SUB1: +STRMM_L8x8_SUB1: andi. L, K1, 7 // K1 & 7 -> L - ble .LSTRMM_L8x8_SAVE + ble STRMM_L8x8_SAVE -.LSTRMM_L8x8_SUB2: +STRMM_L8x8_SUB2: KERNEL8x8_SUB1 addic. L, L, -1 - bgt .LSTRMM_L8x8_SUB2 + bgt STRMM_L8x8_SUB2 -.LSTRMM_L8x8_SAVE: +STRMM_L8x8_SAVE: SAVE8x8 @@ -350,12 +349,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -.LSTRMM_L8x8_END: +STRMM_L8x8_END: -.LSTRMM_L8x4_BEGIN: +STRMM_L8x4_BEGIN: andi. T1, M, 4 - ble .LSTRMM_L8x4_END + ble STRMM_L8x4_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO @@ -381,11 +380,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L - ble .LSTRMM_L8x4_SUB0 + ble STRMM_L8x4_SUB0 cmpwi cr0, L, 1 - ble .LSTRMM_L8x4_SUB4 + ble STRMM_L8x4_SUB4 -.LSTRMM_L8x4_LOOP_START: +STRMM_L8x4_LOOP_START: LOAD8x4_1 KERNEL8x4_I1 @@ -399,11 +398,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL8x4_2 addic. L, L, -2 - ble .LSTRMM_L8x4_LOOP_END + ble STRMM_L8x4_LOOP_END .align 5 -.LSTRMM_L8x4_LOOP: +STRMM_L8x4_LOOP: KERNEL8x4_1 KERNEL8x4_2 @@ -416,9 +415,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL8x4_2 addic. 
L, L, -1 - bgt .LSTRMM_L8x4_LOOP + bgt STRMM_L8x4_LOOP -.LSTRMM_L8x4_LOOP_END: +STRMM_L8x4_LOOP_END: KERNEL8x4_1 KERNEL8x4_2 @@ -430,9 +429,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL8x4_1 KERNEL8x4_E2 - b .LSTRMM_L8x4_SUB1 + b STRMM_L8x4_SUB1 -.LSTRMM_L8x4_SUB4: +STRMM_L8x4_SUB4: KERNEL8x4_SUBI1 KERNEL8x4_SUB1 @@ -444,31 +443,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL8x4_SUB1 KERNEL8x4_SUB1 - b .LSTRMM_L8x4_SUB1 + b STRMM_L8x4_SUB1 -.LSTRMM_L8x4_SUB0: +STRMM_L8x4_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL8x4_SUBI1 addic. L, L, -1 - ble .LSTRMM_L8x4_SAVE - b .LSTRMM_L8x4_SUB2 + ble STRMM_L8x4_SAVE + b STRMM_L8x4_SUB2 -.LSTRMM_L8x4_SUB1: +STRMM_L8x4_SUB1: andi. L, K1, 7 // K1 & 7 -> L - ble .LSTRMM_L8x4_SAVE + ble STRMM_L8x4_SAVE -.LSTRMM_L8x4_SUB2: +STRMM_L8x4_SUB2: KERNEL8x4_SUB1 addic. L, L, -1 - bgt .LSTRMM_L8x4_SUB2 + bgt STRMM_L8x4_SUB2 -.LSTRMM_L8x4_SAVE: +STRMM_L8x4_SAVE: SAVE8x4 @@ -485,12 +484,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -.LSTRMM_L8x4_END: +STRMM_L8x4_END: -.LSTRMM_L8x2_BEGIN: +STRMM_L8x2_BEGIN: andi. T1, M, 2 - ble .LSTRMM_L8x2_END + ble STRMM_L8x2_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO @@ -516,11 +515,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L - ble .LSTRMM_L8x2_SUB0 + ble STRMM_L8x2_SUB0 cmpwi cr0, L, 1 - ble .LSTRMM_L8x2_SUB4 + ble STRMM_L8x2_SUB4 -.LSTRMM_L8x2_LOOP_START: +STRMM_L8x2_LOOP_START: LOAD8x2_1 KERNEL8x2_I1 @@ -534,11 +533,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL8x2_2 addic. L, L, -2 - ble .LSTRMM_L8x2_LOOP_END + ble STRMM_L8x2_LOOP_END .align 5 -.LSTRMM_L8x2_LOOP: +STRMM_L8x2_LOOP: KERNEL8x2_1 KERNEL8x2_2 @@ -551,9 +550,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
KERNEL8x2_2 addic. L, L, -1 - bgt .LSTRMM_L8x2_LOOP + bgt STRMM_L8x2_LOOP -.LSTRMM_L8x2_LOOP_END: +STRMM_L8x2_LOOP_END: KERNEL8x2_1 KERNEL8x2_2 @@ -565,9 +564,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL8x2_1 KERNEL8x2_E2 - b .LSTRMM_L8x2_SUB1 + b STRMM_L8x2_SUB1 -.LSTRMM_L8x2_SUB4: +STRMM_L8x2_SUB4: KERNEL8x2_SUBI1 KERNEL8x2_SUB1 @@ -579,31 +578,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL8x2_SUB1 KERNEL8x2_SUB1 - b .LSTRMM_L8x2_SUB1 + b STRMM_L8x2_SUB1 -.LSTRMM_L8x2_SUB0: +STRMM_L8x2_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL8x2_SUBI1 addic. L, L, -1 - ble .LSTRMM_L8x2_SAVE - b .LSTRMM_L8x2_SUB2 + ble STRMM_L8x2_SAVE + b STRMM_L8x2_SUB2 -.LSTRMM_L8x2_SUB1: +STRMM_L8x2_SUB1: andi. L, K1, 7 // K1 & 7 -> L - ble .LSTRMM_L8x2_SAVE + ble STRMM_L8x2_SAVE -.LSTRMM_L8x2_SUB2: +STRMM_L8x2_SUB2: KERNEL8x2_SUB1 addic. L, L, -1 - bgt .LSTRMM_L8x2_SUB2 + bgt STRMM_L8x2_SUB2 -.LSTRMM_L8x2_SAVE: +STRMM_L8x2_SAVE: SAVE8x2 @@ -620,12 +619,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -.LSTRMM_L8x2_END: +STRMM_L8x2_END: -.LSTRMM_L8x1_BEGIN: +STRMM_L8x1_BEGIN: andi. T1, M, 1 - ble .LSTRMM_L8x1_END + ble STRMM_L8x1_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO @@ -651,11 +650,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L - ble .LSTRMM_L8x1_SUB0 + ble STRMM_L8x1_SUB0 cmpwi cr0, L, 1 - ble .LSTRMM_L8x1_SUB4 + ble STRMM_L8x1_SUB4 -.LSTRMM_L8x1_LOOP_START: +STRMM_L8x1_LOOP_START: LOAD8x1_1 KERNEL8x1_I1 @@ -669,11 +668,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL8x1_2 addic. 
L, L, -2 - ble .LSTRMM_L8x1_LOOP_END + ble STRMM_L8x1_LOOP_END .align 5 -.LSTRMM_L8x1_LOOP: +STRMM_L8x1_LOOP: KERNEL8x1_1 KERNEL8x1_2 @@ -686,9 +685,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL8x1_2 addic. L, L, -1 - bgt .LSTRMM_L8x1_LOOP + bgt STRMM_L8x1_LOOP -.LSTRMM_L8x1_LOOP_END: +STRMM_L8x1_LOOP_END: KERNEL8x1_1 KERNEL8x1_2 @@ -700,9 +699,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL8x1_1 KERNEL8x1_E2 - b .LSTRMM_L8x1_SUB1 + b STRMM_L8x1_SUB1 -.LSTRMM_L8x1_SUB4: +STRMM_L8x1_SUB4: KERNEL8x1_SUBI1 KERNEL8x1_SUB1 @@ -714,31 +713,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL8x1_SUB1 KERNEL8x1_SUB1 - b .LSTRMM_L8x1_SUB1 + b STRMM_L8x1_SUB1 -.LSTRMM_L8x1_SUB0: +STRMM_L8x1_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL8x1_SUBI1 addic. L, L, -1 - ble .LSTRMM_L8x1_SAVE - b .LSTRMM_L8x1_SUB2 + ble STRMM_L8x1_SAVE + b STRMM_L8x1_SUB2 -.LSTRMM_L8x1_SUB1: +STRMM_L8x1_SUB1: andi. L, K1, 7 // K1 & 7 -> L - ble .LSTRMM_L8x1_SAVE + ble STRMM_L8x1_SAVE -.LSTRMM_L8x1_SUB2: +STRMM_L8x1_SUB2: KERNEL8x1_SUB1 addic. L, L, -1 - bgt .LSTRMM_L8x1_SUB2 + bgt STRMM_L8x1_SUB2 -.LSTRMM_L8x1_SAVE: +STRMM_L8x1_SAVE: SAVE8x1 @@ -755,7 +754,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -.LSTRMM_L8x1_END: +STRMM_L8x1_END: slwi T1, K, 5 add B, B, T1 @@ -766,23 +765,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addic. J, J, -1 - bgt .LSTRMM_L8_BEGIN + bgt STRMM_L8_BEGIN andi. T2, N, 7 - ble .L999 + ble L999 -.LSTRMM_L8_END: +STRMM_L8_END: - b .LSTRMM_L4_BEGIN + b STRMM_L4_BEGIN -.L999_H1: +L999_H1: - b .L999 + b L999 -.LSTRMM_L4_BEGIN: +STRMM_L4_BEGIN: andi. T1, N, 4 - ble .LSTRMM_L4_END + ble STRMM_L4_END mr CO, C mr AO, A slwi T1, LDC , 2 @@ -793,9 +792,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif srawi. 
I, M, 4 - ble .LSTRMM_L4x16_END + ble STRMM_L4x16_END -.LSTRMM_L4x16_BEGIN: +STRMM_L4x16_BEGIN: #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) @@ -822,11 +821,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L - ble .LSTRMM_L4x16_SUB0 + ble STRMM_L4x16_SUB0 cmpwi cr0, L, 1 - ble .LSTRMM_L4x16_SUB4 + ble STRMM_L4x16_SUB4 -.LSTRMM_L4x16_LOOP_START: +STRMM_L4x16_LOOP_START: dcbt AO, PRE LOAD4x16_1 @@ -849,11 +848,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x16_2 addic. L, L, -2 - ble .LSTRMM_L4x16_LOOP_END + ble STRMM_L4x16_LOOP_END .align 5 -.LSTRMM_L4x16_LOOP: +STRMM_L4x16_LOOP: dcbt AO, PRE KERNEL4x16_1 @@ -874,9 +873,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x16_2 addic. L, L, -1 - bgt .LSTRMM_L4x16_LOOP + bgt STRMM_L4x16_LOOP -.LSTRMM_L4x16_LOOP_END: +STRMM_L4x16_LOOP_END: dcbt AO, PRE KERNEL4x16_1 @@ -895,9 +894,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x16_1 KERNEL4x16_E2 - b .LSTRMM_L4x16_SUB1 + b STRMM_L4x16_SUB1 -.LSTRMM_L4x16_SUB4: +STRMM_L4x16_SUB4: dcbt AO, PRE KERNEL4x16_SUBI1 @@ -913,31 +912,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x16_SUB1 KERNEL4x16_SUB1 - b .LSTRMM_L4x16_SUB1 + b STRMM_L4x16_SUB1 -.LSTRMM_L4x16_SUB0: +STRMM_L4x16_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL4x16_SUBI1 addic. L, L, -1 - ble .LSTRMM_L4x16_SAVE - b .LSTRMM_L4x16_SUB2 + ble STRMM_L4x16_SAVE + b STRMM_L4x16_SUB2 -.LSTRMM_L4x16_SUB1: +STRMM_L4x16_SUB1: andi. L, K1, 7 // K1 & 7 -> L - ble .LSTRMM_L4x16_SAVE + ble STRMM_L4x16_SAVE -.LSTRMM_L4x16_SUB2: +STRMM_L4x16_SUB2: KERNEL4x16_SUB1 addic. L, L, -1 - bgt .LSTRMM_L4x16_SUB2 + bgt STRMM_L4x16_SUB2 -.LSTRMM_L4x16_SAVE: +STRMM_L4x16_SAVE: SAVE4x16 @@ -955,16 +954,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addic. I, I, -1 - bgt .LSTRMM_L4x16_BEGIN + bgt STRMM_L4x16_BEGIN -.LSTRMM_L4x16_END: +STRMM_L4x16_END: -.LSTRMM_L4x8_BEGIN: +STRMM_L4x8_BEGIN: andi. T2, M, 15 - ble .LSTRMM_L4x1_END + ble STRMM_L4x1_END andi. T1, M, 8 - ble .LSTRMM_L4x8_END + ble STRMM_L4x8_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO @@ -990,11 +989,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L - ble .LSTRMM_L4x8_SUB0 + ble STRMM_L4x8_SUB0 cmpwi cr0, L, 1 - ble .LSTRMM_L4x8_SUB4 + ble STRMM_L4x8_SUB4 -.LSTRMM_L4x8_LOOP_START: +STRMM_L4x8_LOOP_START: LOAD4x8_1 KERNEL4x8_I1 @@ -1008,11 +1007,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x8_2 addic. L, L, -2 - ble .LSTRMM_L4x8_LOOP_END + ble STRMM_L4x8_LOOP_END .align 5 -.LSTRMM_L4x8_LOOP: +STRMM_L4x8_LOOP: KERNEL4x8_1 KERNEL4x8_2 @@ -1025,9 +1024,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x8_2 addic. L, L, -1 - bgt .LSTRMM_L4x8_LOOP + bgt STRMM_L4x8_LOOP -.LSTRMM_L4x8_LOOP_END: +STRMM_L4x8_LOOP_END: KERNEL4x8_1 KERNEL4x8_2 @@ -1039,9 +1038,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x8_1 KERNEL4x8_E2 - b .LSTRMM_L4x8_SUB1 + b STRMM_L4x8_SUB1 -.LSTRMM_L4x8_SUB4: +STRMM_L4x8_SUB4: KERNEL4x8_SUBI1 KERNEL4x8_SUB1 @@ -1053,31 +1052,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x8_SUB1 KERNEL4x8_SUB1 - b .LSTRMM_L4x8_SUB1 + b STRMM_L4x8_SUB1 -.LSTRMM_L4x8_SUB0: +STRMM_L4x8_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL4x8_SUBI1 addic. L, L, -1 - ble .LSTRMM_L4x8_SAVE - b .LSTRMM_L4x8_SUB2 + ble STRMM_L4x8_SAVE + b STRMM_L4x8_SUB2 -.LSTRMM_L4x8_SUB1: +STRMM_L4x8_SUB1: andi. L, K1, 7 // K1 & 7 -> L - ble .LSTRMM_L4x8_SAVE + ble STRMM_L4x8_SAVE -.LSTRMM_L4x8_SUB2: +STRMM_L4x8_SUB2: KERNEL4x8_SUB1 addic. 
L, L, -1 - bgt .LSTRMM_L4x8_SUB2 + bgt STRMM_L4x8_SUB2 -.LSTRMM_L4x8_SAVE: +STRMM_L4x8_SAVE: SAVE4x8 @@ -1094,12 +1093,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -.LSTRMM_L4x8_END: +STRMM_L4x8_END: -.LSTRMM_L4x4_BEGIN: +STRMM_L4x4_BEGIN: andi. T1, M, 4 - ble .LSTRMM_L4x4_END + ble STRMM_L4x4_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO @@ -1125,11 +1124,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L - ble .LSTRMM_L4x4_SUB0 + ble STRMM_L4x4_SUB0 cmpwi cr0, L, 1 - ble .LSTRMM_L4x4_SUB4 + ble STRMM_L4x4_SUB4 -.LSTRMM_L4x4_LOOP_START: +STRMM_L4x4_LOOP_START: LOAD4x4_1 KERNEL4x4_I1 @@ -1143,11 +1142,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x4_2 addic. L, L, -2 - ble .LSTRMM_L4x4_LOOP_END + ble STRMM_L4x4_LOOP_END .align 5 -.LSTRMM_L4x4_LOOP: +STRMM_L4x4_LOOP: KERNEL4x4_1 KERNEL4x4_2 @@ -1160,9 +1159,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x4_2 addic. L, L, -1 - bgt .LSTRMM_L4x4_LOOP + bgt STRMM_L4x4_LOOP -.LSTRMM_L4x4_LOOP_END: +STRMM_L4x4_LOOP_END: KERNEL4x4_1 KERNEL4x4_2 @@ -1174,9 +1173,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x4_1 KERNEL4x4_E2 - b .LSTRMM_L4x4_SUB1 + b STRMM_L4x4_SUB1 -.LSTRMM_L4x4_SUB4: +STRMM_L4x4_SUB4: KERNEL4x4_SUBI1 KERNEL4x4_SUB1 @@ -1188,31 +1187,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x4_SUB1 KERNEL4x4_SUB1 - b .LSTRMM_L4x4_SUB1 + b STRMM_L4x4_SUB1 -.LSTRMM_L4x4_SUB0: +STRMM_L4x4_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL4x4_SUBI1 addic. L, L, -1 - ble .LSTRMM_L4x4_SAVE - b .LSTRMM_L4x4_SUB2 + ble STRMM_L4x4_SAVE + b STRMM_L4x4_SUB2 -.LSTRMM_L4x4_SUB1: +STRMM_L4x4_SUB1: andi. 
L, K1, 7 // K1 & 7 -> L - ble .LSTRMM_L4x4_SAVE + ble STRMM_L4x4_SAVE -.LSTRMM_L4x4_SUB2: +STRMM_L4x4_SUB2: KERNEL4x4_SUB1 addic. L, L, -1 - bgt .LSTRMM_L4x4_SUB2 + bgt STRMM_L4x4_SUB2 -.LSTRMM_L4x4_SAVE: +STRMM_L4x4_SAVE: SAVE4x4 @@ -1229,12 +1228,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -.LSTRMM_L4x4_END: +STRMM_L4x4_END: -.LSTRMM_L4x2_BEGIN: +STRMM_L4x2_BEGIN: andi. T1, M, 2 - ble .LSTRMM_L4x2_END + ble STRMM_L4x2_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO @@ -1260,11 +1259,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L - ble .LSTRMM_L4x2_SUB0 + ble STRMM_L4x2_SUB0 cmpwi cr0, L, 1 - ble .LSTRMM_L4x2_SUB4 + ble STRMM_L4x2_SUB4 -.LSTRMM_L4x2_LOOP_START: +STRMM_L4x2_LOOP_START: LOAD4x2_1 KERNEL4x2_I1 @@ -1278,11 +1277,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x2_2 addic. L, L, -2 - ble .LSTRMM_L4x2_LOOP_END + ble STRMM_L4x2_LOOP_END .align 5 -.LSTRMM_L4x2_LOOP: +STRMM_L4x2_LOOP: KERNEL4x2_1 KERNEL4x2_2 @@ -1295,9 +1294,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x2_2 addic. L, L, -1 - bgt .LSTRMM_L4x2_LOOP + bgt STRMM_L4x2_LOOP -.LSTRMM_L4x2_LOOP_END: +STRMM_L4x2_LOOP_END: KERNEL4x2_1 KERNEL4x2_2 @@ -1309,9 +1308,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x2_1 KERNEL4x2_E2 - b .LSTRMM_L4x2_SUB1 + b STRMM_L4x2_SUB1 -.LSTRMM_L4x2_SUB4: +STRMM_L4x2_SUB4: KERNEL4x2_SUBI1 KERNEL4x2_SUB1 @@ -1323,31 +1322,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x2_SUB1 KERNEL4x2_SUB1 - b .LSTRMM_L4x2_SUB1 + b STRMM_L4x2_SUB1 -.LSTRMM_L4x2_SUB0: +STRMM_L4x2_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL4x2_SUBI1 addic. 
L, L, -1 - ble .LSTRMM_L4x2_SAVE - b .LSTRMM_L4x2_SUB2 + ble STRMM_L4x2_SAVE + b STRMM_L4x2_SUB2 -.LSTRMM_L4x2_SUB1: +STRMM_L4x2_SUB1: andi. L, K1, 7 // K1 & 7 -> L - ble .LSTRMM_L4x2_SAVE + ble STRMM_L4x2_SAVE -.LSTRMM_L4x2_SUB2: +STRMM_L4x2_SUB2: KERNEL4x2_SUB1 addic. L, L, -1 - bgt .LSTRMM_L4x2_SUB2 + bgt STRMM_L4x2_SUB2 -.LSTRMM_L4x2_SAVE: +STRMM_L4x2_SAVE: SAVE4x2 @@ -1364,12 +1363,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -.LSTRMM_L4x2_END: +STRMM_L4x2_END: -.LSTRMM_L4x1_BEGIN: +STRMM_L4x1_BEGIN: andi. T1, M, 1 - ble .LSTRMM_L4x1_END + ble STRMM_L4x1_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO @@ -1395,11 +1394,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L - ble .LSTRMM_L4x1_SUB0 + ble STRMM_L4x1_SUB0 cmpwi cr0, L, 1 - ble .LSTRMM_L4x1_SUB4 + ble STRMM_L4x1_SUB4 -.LSTRMM_L4x1_LOOP_START: +STRMM_L4x1_LOOP_START: LOAD4x1_1 KERNEL4x1_I1 @@ -1413,11 +1412,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x1_2 addic. L, L, -2 - ble .LSTRMM_L4x1_LOOP_END + ble STRMM_L4x1_LOOP_END .align 5 -.LSTRMM_L4x1_LOOP: +STRMM_L4x1_LOOP: KERNEL4x1_1 KERNEL4x1_2 @@ -1430,9 +1429,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x1_2 addic. L, L, -1 - bgt .LSTRMM_L4x1_LOOP + bgt STRMM_L4x1_LOOP -.LSTRMM_L4x1_LOOP_END: +STRMM_L4x1_LOOP_END: KERNEL4x1_1 KERNEL4x1_2 @@ -1444,9 +1443,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x1_1 KERNEL4x1_E2 - b .LSTRMM_L4x1_SUB1 + b STRMM_L4x1_SUB1 -.LSTRMM_L4x1_SUB4: +STRMM_L4x1_SUB4: KERNEL4x1_SUBI1 KERNEL4x1_SUB1 @@ -1458,31 +1457,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x1_SUB1 KERNEL4x1_SUB1 - b .LSTRMM_L4x1_SUB1 + b STRMM_L4x1_SUB1 -.LSTRMM_L4x1_SUB0: +STRMM_L4x1_SUB0: andi. 
L, K1, 7 // K1 & 7 -> L KERNEL4x1_SUBI1 addic. L, L, -1 - ble .LSTRMM_L4x1_SAVE - b .LSTRMM_L4x1_SUB2 + ble STRMM_L4x1_SAVE + b STRMM_L4x1_SUB2 -.LSTRMM_L4x1_SUB1: +STRMM_L4x1_SUB1: andi. L, K1, 7 // K1 & 7 -> L - ble .LSTRMM_L4x1_SAVE + ble STRMM_L4x1_SAVE -.LSTRMM_L4x1_SUB2: +STRMM_L4x1_SUB2: KERNEL4x1_SUB1 addic. L, L, -1 - bgt .LSTRMM_L4x1_SUB2 + bgt STRMM_L4x1_SUB2 -.LSTRMM_L4x1_SAVE: +STRMM_L4x1_SAVE: SAVE4x1 @@ -1499,7 +1498,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -.LSTRMM_L4x1_END: +STRMM_L4x1_END: slwi T1, K, 4 add B, B, T1 @@ -1509,11 +1508,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -.LSTRMM_L4_END: -.LSTRMM_L2_BEGIN: +STRMM_L4_END: +STRMM_L2_BEGIN: andi. T1, N, 2 - ble .LSTRMM_L2_END + ble STRMM_L2_END mr CO, C mr AO, A slwi T1, LDC , 1 @@ -1524,9 +1523,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif srawi. I, M, 4 - ble .LSTRMM_L2x16_END + ble STRMM_L2x16_END -.LSTRMM_L2x16_BEGIN: +STRMM_L2x16_BEGIN: #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) @@ -1553,11 +1552,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L - ble .LSTRMM_L2x16_SUB0 + ble STRMM_L2x16_SUB0 cmpwi cr0, L, 1 - ble .LSTRMM_L2x16_SUB4 + ble STRMM_L2x16_SUB4 -.LSTRMM_L2x16_LOOP_START: +STRMM_L2x16_LOOP_START: dcbt AO, PRE LOAD2x16_1 @@ -1580,11 +1579,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x16_2 addic. L, L, -2 - ble .LSTRMM_L2x16_LOOP_END + ble STRMM_L2x16_LOOP_END .align 5 -.LSTRMM_L2x16_LOOP: +STRMM_L2x16_LOOP: dcbt AO, PRE KERNEL2x16_1 @@ -1605,9 +1604,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x16_2 addic. 
L, L, -1 - bgt .LSTRMM_L2x16_LOOP + bgt STRMM_L2x16_LOOP -.LSTRMM_L2x16_LOOP_END: +STRMM_L2x16_LOOP_END: dcbt AO, PRE KERNEL2x16_1 @@ -1626,9 +1625,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x16_1 KERNEL2x16_E2 - b .LSTRMM_L2x16_SUB1 + b STRMM_L2x16_SUB1 -.LSTRMM_L2x16_SUB4: +STRMM_L2x16_SUB4: dcbt AO, PRE KERNEL2x16_SUBI1 @@ -1644,31 +1643,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x16_SUB1 KERNEL2x16_SUB1 - b .LSTRMM_L2x16_SUB1 + b STRMM_L2x16_SUB1 -.LSTRMM_L2x16_SUB0: +STRMM_L2x16_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL2x16_SUBI1 addic. L, L, -1 - ble .LSTRMM_L2x16_SAVE - b .LSTRMM_L2x16_SUB2 + ble STRMM_L2x16_SAVE + b STRMM_L2x16_SUB2 -.LSTRMM_L2x16_SUB1: +STRMM_L2x16_SUB1: andi. L, K1, 7 // K1 & 7 -> L - ble .LSTRMM_L2x16_SAVE + ble STRMM_L2x16_SAVE -.LSTRMM_L2x16_SUB2: +STRMM_L2x16_SUB2: KERNEL2x16_SUB1 addic. L, L, -1 - bgt .LSTRMM_L2x16_SUB2 + bgt STRMM_L2x16_SUB2 -.LSTRMM_L2x16_SAVE: +STRMM_L2x16_SAVE: SAVE2x16 @@ -1686,16 +1685,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addic. I, I, -1 - bgt .LSTRMM_L2x16_BEGIN + bgt STRMM_L2x16_BEGIN -.LSTRMM_L2x16_END: +STRMM_L2x16_END: -.LSTRMM_L2x8_BEGIN: +STRMM_L2x8_BEGIN: andi. T2, M, 15 - ble .LSTRMM_L2x1_END + ble STRMM_L2x1_END andi. T1, M, 8 - ble .LSTRMM_L2x8_END + ble STRMM_L2x8_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO @@ -1721,11 +1720,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L - ble .LSTRMM_L2x8_SUB0 + ble STRMM_L2x8_SUB0 cmpwi cr0, L, 1 - ble .LSTRMM_L2x8_SUB4 + ble STRMM_L2x8_SUB4 -.LSTRMM_L2x8_LOOP_START: +STRMM_L2x8_LOOP_START: LOAD2x8_1 KERNEL2x8_I1 @@ -1739,11 +1738,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x8_2 addic. 
L, L, -2 - ble .LSTRMM_L2x8_LOOP_END + ble STRMM_L2x8_LOOP_END .align 5 -.LSTRMM_L2x8_LOOP: +STRMM_L2x8_LOOP: KERNEL2x8_1 KERNEL2x8_2 @@ -1756,9 +1755,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x8_2 addic. L, L, -1 - bgt .LSTRMM_L2x8_LOOP + bgt STRMM_L2x8_LOOP -.LSTRMM_L2x8_LOOP_END: +STRMM_L2x8_LOOP_END: KERNEL2x8_1 KERNEL2x8_2 @@ -1770,9 +1769,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x8_1 KERNEL2x8_E2 - b .LSTRMM_L2x8_SUB1 + b STRMM_L2x8_SUB1 -.LSTRMM_L2x8_SUB4: +STRMM_L2x8_SUB4: KERNEL2x8_SUBI1 KERNEL2x8_SUB1 @@ -1784,31 +1783,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x8_SUB1 KERNEL2x8_SUB1 - b .LSTRMM_L2x8_SUB1 + b STRMM_L2x8_SUB1 -.LSTRMM_L2x8_SUB0: +STRMM_L2x8_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL2x8_SUBI1 addic. L, L, -1 - ble .LSTRMM_L2x8_SAVE - b .LSTRMM_L2x8_SUB2 + ble STRMM_L2x8_SAVE + b STRMM_L2x8_SUB2 -.LSTRMM_L2x8_SUB1: +STRMM_L2x8_SUB1: andi. L, K1, 7 // K1 & 7 -> L - ble .LSTRMM_L2x8_SAVE + ble STRMM_L2x8_SAVE -.LSTRMM_L2x8_SUB2: +STRMM_L2x8_SUB2: KERNEL2x8_SUB1 addic. L, L, -1 - bgt .LSTRMM_L2x8_SUB2 + bgt STRMM_L2x8_SUB2 -.LSTRMM_L2x8_SAVE: +STRMM_L2x8_SAVE: SAVE2x8 @@ -1825,12 +1824,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -.LSTRMM_L2x8_END: +STRMM_L2x8_END: -.LSTRMM_L2x4_BEGIN: +STRMM_L2x4_BEGIN: andi. T1, M, 4 - ble .LSTRMM_L2x4_END + ble STRMM_L2x4_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO @@ -1856,11 +1855,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr KKK, T1 mr K1, T1 srawi. 
L, K1, 3 // KTEMP / 8 -> L - ble .LSTRMM_L2x4_SUB0 + ble STRMM_L2x4_SUB0 cmpwi cr0, L, 1 - ble .LSTRMM_L2x4_SUB4 + ble STRMM_L2x4_SUB4 -.LSTRMM_L2x4_LOOP_START: +STRMM_L2x4_LOOP_START: LOAD2x4_1 KERNEL2x4_I1 @@ -1874,11 +1873,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x4_2 addic. L, L, -2 - ble .LSTRMM_L2x4_LOOP_END + ble STRMM_L2x4_LOOP_END .align 5 -.LSTRMM_L2x4_LOOP: +STRMM_L2x4_LOOP: KERNEL2x4_1 KERNEL2x4_2 @@ -1891,9 +1890,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x4_2 addic. L, L, -1 - bgt .LSTRMM_L2x4_LOOP + bgt STRMM_L2x4_LOOP -.LSTRMM_L2x4_LOOP_END: +STRMM_L2x4_LOOP_END: KERNEL2x4_1 KERNEL2x4_2 @@ -1905,9 +1904,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x4_1 KERNEL2x4_E2 - b .LSTRMM_L2x4_SUB1 + b STRMM_L2x4_SUB1 -.LSTRMM_L2x4_SUB4: +STRMM_L2x4_SUB4: KERNEL2x4_SUBI1 KERNEL2x4_SUB1 @@ -1919,31 +1918,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x4_SUB1 KERNEL2x4_SUB1 - b .LSTRMM_L2x4_SUB1 + b STRMM_L2x4_SUB1 -.LSTRMM_L2x4_SUB0: +STRMM_L2x4_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL2x4_SUBI1 addic. L, L, -1 - ble .LSTRMM_L2x4_SAVE - b .LSTRMM_L2x4_SUB2 + ble STRMM_L2x4_SAVE + b STRMM_L2x4_SUB2 -.LSTRMM_L2x4_SUB1: +STRMM_L2x4_SUB1: andi. L, K1, 7 // K1 & 7 -> L - ble .LSTRMM_L2x4_SAVE + ble STRMM_L2x4_SAVE -.LSTRMM_L2x4_SUB2: +STRMM_L2x4_SUB2: KERNEL2x4_SUB1 addic. L, L, -1 - bgt .LSTRMM_L2x4_SUB2 + bgt STRMM_L2x4_SUB2 -.LSTRMM_L2x4_SAVE: +STRMM_L2x4_SAVE: SAVE2x4 @@ -1960,12 +1959,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -.LSTRMM_L2x4_END: +STRMM_L2x4_END: -.LSTRMM_L2x2_BEGIN: +STRMM_L2x2_BEGIN: andi. T1, M, 2 - ble .LSTRMM_L2x2_END + ble STRMM_L2x2_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO @@ -1991,11 +1990,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L - ble .LSTRMM_L2x2_SUB0 + ble STRMM_L2x2_SUB0 cmpwi cr0, L, 1 - ble .LSTRMM_L2x2_SUB4 + ble STRMM_L2x2_SUB4 -.LSTRMM_L2x2_LOOP_START: +STRMM_L2x2_LOOP_START: LOAD2x2_1 KERNEL2x2_I1 @@ -2009,11 +2008,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x2_2 addic. L, L, -2 - ble .LSTRMM_L2x2_LOOP_END + ble STRMM_L2x2_LOOP_END .align 5 -.LSTRMM_L2x2_LOOP: +STRMM_L2x2_LOOP: KERNEL2x2_1 KERNEL2x2_2 @@ -2026,9 +2025,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x2_2 addic. L, L, -1 - bgt .LSTRMM_L2x2_LOOP + bgt STRMM_L2x2_LOOP -.LSTRMM_L2x2_LOOP_END: +STRMM_L2x2_LOOP_END: KERNEL2x2_1 KERNEL2x2_2 @@ -2040,9 +2039,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x2_1 KERNEL2x2_E2 - b .LSTRMM_L2x2_SUB1 + b STRMM_L2x2_SUB1 -.LSTRMM_L2x2_SUB4: +STRMM_L2x2_SUB4: KERNEL2x2_SUBI1 KERNEL2x2_SUB1 @@ -2054,31 +2053,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x2_SUB1 KERNEL2x2_SUB1 - b .LSTRMM_L2x2_SUB1 + b STRMM_L2x2_SUB1 -.LSTRMM_L2x2_SUB0: +STRMM_L2x2_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL2x2_SUBI1 addic. L, L, -1 - ble .LSTRMM_L2x2_SAVE - b .LSTRMM_L2x2_SUB2 + ble STRMM_L2x2_SAVE + b STRMM_L2x2_SUB2 -.LSTRMM_L2x2_SUB1: +STRMM_L2x2_SUB1: andi. L, K1, 7 // K1 & 7 -> L - ble .LSTRMM_L2x2_SAVE + ble STRMM_L2x2_SAVE -.LSTRMM_L2x2_SUB2: +STRMM_L2x2_SUB2: KERNEL2x2_SUB1 addic. L, L, -1 - bgt .LSTRMM_L2x2_SUB2 + bgt STRMM_L2x2_SUB2 -.LSTRMM_L2x2_SAVE: +STRMM_L2x2_SAVE: SAVE2x2 @@ -2095,12 +2094,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -.LSTRMM_L2x2_END: +STRMM_L2x2_END: -.LSTRMM_L2x1_BEGIN: +STRMM_L2x1_BEGIN: andi. 
T1, M, 1 - ble .LSTRMM_L2x1_END + ble STRMM_L2x1_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO @@ -2126,11 +2125,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L - ble .LSTRMM_L2x1_SUB0 + ble STRMM_L2x1_SUB0 cmpwi cr0, L, 1 - ble .LSTRMM_L2x1_SUB4 + ble STRMM_L2x1_SUB4 -.LSTRMM_L2x1_LOOP_START: +STRMM_L2x1_LOOP_START: LOAD2x1_1 KERNEL2x1_I1 @@ -2144,11 +2143,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x1_2 addic. L, L, -2 - ble .LSTRMM_L2x1_LOOP_END + ble STRMM_L2x1_LOOP_END .align 5 -.LSTRMM_L2x1_LOOP: +STRMM_L2x1_LOOP: KERNEL2x1_1 KERNEL2x1_2 @@ -2161,9 +2160,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x1_2 addic. L, L, -1 - bgt .LSTRMM_L2x1_LOOP + bgt STRMM_L2x1_LOOP -.LSTRMM_L2x1_LOOP_END: +STRMM_L2x1_LOOP_END: KERNEL2x1_1 KERNEL2x1_2 @@ -2175,9 +2174,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x1_1 KERNEL2x1_E2 - b .LSTRMM_L2x1_SUB1 + b STRMM_L2x1_SUB1 -.LSTRMM_L2x1_SUB4: +STRMM_L2x1_SUB4: KERNEL2x1_SUBI1 KERNEL2x1_SUB1 @@ -2189,31 +2188,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x1_SUB1 KERNEL2x1_SUB1 - b .LSTRMM_L2x1_SUB1 + b STRMM_L2x1_SUB1 -.LSTRMM_L2x1_SUB0: +STRMM_L2x1_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL2x1_SUBI1 addic. L, L, -1 - ble .LSTRMM_L2x1_SAVE - b .LSTRMM_L2x1_SUB2 + ble STRMM_L2x1_SAVE + b STRMM_L2x1_SUB2 -.LSTRMM_L2x1_SUB1: +STRMM_L2x1_SUB1: andi. L, K1, 7 // K1 & 7 -> L - ble .LSTRMM_L2x1_SAVE + ble STRMM_L2x1_SAVE -.LSTRMM_L2x1_SUB2: +STRMM_L2x1_SUB2: KERNEL2x1_SUB1 addic. L, L, -1 - bgt .LSTRMM_L2x1_SUB2 + bgt STRMM_L2x1_SUB2 -.LSTRMM_L2x1_SAVE: +STRMM_L2x1_SAVE: SAVE2x1 @@ -2230,7 +2229,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif -.LSTRMM_L2x1_END: +STRMM_L2x1_END: slwi T1, K, 3 add B, B, T1 @@ -2240,11 +2239,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -.LSTRMM_L2_END: -.LSTRMM_L1_BEGIN: +STRMM_L2_END: +STRMM_L1_BEGIN: andi. T1, N, 1 - ble .LSTRMM_L1_END + ble STRMM_L1_END mr CO, C mr AO, A @@ -2253,9 +2252,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif srawi. I, M, 4 - ble .LSTRMM_L1x16_END + ble STRMM_L1x16_END -.LSTRMM_L1x16_BEGIN: +STRMM_L1x16_BEGIN: #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) @@ -2282,11 +2281,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L - ble .LSTRMM_L1x16_SUB0 + ble STRMM_L1x16_SUB0 cmpwi cr0, L, 1 - ble .LSTRMM_L1x16_SUB4 + ble STRMM_L1x16_SUB4 -.LSTRMM_L1x16_LOOP_START: +STRMM_L1x16_LOOP_START: dcbt AO, PRE LOAD1x16_1 @@ -2309,11 +2308,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x16_2 addic. L, L, -2 - ble .LSTRMM_L1x16_LOOP_END + ble STRMM_L1x16_LOOP_END .align 5 -.LSTRMM_L1x16_LOOP: +STRMM_L1x16_LOOP: dcbt AO, PRE KERNEL1x16_1 @@ -2334,9 +2333,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x16_2 addic. L, L, -1 - bgt .LSTRMM_L1x16_LOOP + bgt STRMM_L1x16_LOOP -.LSTRMM_L1x16_LOOP_END: +STRMM_L1x16_LOOP_END: dcbt AO, PRE KERNEL1x16_1 @@ -2355,9 +2354,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x16_1 KERNEL1x16_E2 - b .LSTRMM_L1x16_SUB1 + b STRMM_L1x16_SUB1 -.LSTRMM_L1x16_SUB4: +STRMM_L1x16_SUB4: dcbt AO, PRE KERNEL1x16_SUBI1 @@ -2373,31 +2372,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x16_SUB1 KERNEL1x16_SUB1 - b .LSTRMM_L1x16_SUB1 + b STRMM_L1x16_SUB1 -.LSTRMM_L1x16_SUB0: +STRMM_L1x16_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL1x16_SUBI1 addic. 
L, L, -1 - ble .LSTRMM_L1x16_SAVE - b .LSTRMM_L1x16_SUB2 + ble STRMM_L1x16_SAVE + b STRMM_L1x16_SUB2 -.LSTRMM_L1x16_SUB1: +STRMM_L1x16_SUB1: andi. L, K1, 7 // K1 & 7 -> L - ble .LSTRMM_L1x16_SAVE + ble STRMM_L1x16_SAVE -.LSTRMM_L1x16_SUB2: +STRMM_L1x16_SUB2: KERNEL1x16_SUB1 addic. L, L, -1 - bgt .LSTRMM_L1x16_SUB2 + bgt STRMM_L1x16_SUB2 -.LSTRMM_L1x16_SAVE: +STRMM_L1x16_SAVE: SAVE1x16 @@ -2415,16 +2414,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addic. I, I, -1 - bgt .LSTRMM_L1x16_BEGIN + bgt STRMM_L1x16_BEGIN -.LSTRMM_L1x16_END: +STRMM_L1x16_END: -.LSTRMM_L1x8_BEGIN: +STRMM_L1x8_BEGIN: andi. T2, M, 15 - ble .LSTRMM_L1x1_END + ble STRMM_L1x1_END andi. T1, M, 8 - ble .LSTRMM_L1x8_END + ble STRMM_L1x8_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO @@ -2450,11 +2449,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L - ble .LSTRMM_L1x8_SUB0 + ble STRMM_L1x8_SUB0 cmpwi cr0, L, 1 - ble .LSTRMM_L1x8_SUB4 + ble STRMM_L1x8_SUB4 -.LSTRMM_L1x8_LOOP_START: +STRMM_L1x8_LOOP_START: LOAD1x8_1 KERNEL1x8_I1 @@ -2468,11 +2467,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x8_2 addic. L, L, -2 - ble .LSTRMM_L1x8_LOOP_END + ble STRMM_L1x8_LOOP_END .align 5 -.LSTRMM_L1x8_LOOP: +STRMM_L1x8_LOOP: KERNEL1x8_1 KERNEL1x8_2 @@ -2485,9 +2484,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x8_2 addic. L, L, -1 - bgt .LSTRMM_L1x8_LOOP + bgt STRMM_L1x8_LOOP -.LSTRMM_L1x8_LOOP_END: +STRMM_L1x8_LOOP_END: KERNEL1x8_1 KERNEL1x8_2 @@ -2499,9 +2498,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x8_1 KERNEL1x8_E2 - b .LSTRMM_L1x8_SUB1 + b STRMM_L1x8_SUB1 -.LSTRMM_L1x8_SUB4: +STRMM_L1x8_SUB4: KERNEL1x8_SUBI1 KERNEL1x8_SUB1 @@ -2513,31 +2512,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
KERNEL1x8_SUB1 KERNEL1x8_SUB1 - b .LSTRMM_L1x8_SUB1 + b STRMM_L1x8_SUB1 -.LSTRMM_L1x8_SUB0: +STRMM_L1x8_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL1x8_SUBI1 addic. L, L, -1 - ble .LSTRMM_L1x8_SAVE - b .LSTRMM_L1x8_SUB2 + ble STRMM_L1x8_SAVE + b STRMM_L1x8_SUB2 -.LSTRMM_L1x8_SUB1: +STRMM_L1x8_SUB1: andi. L, K1, 7 // K1 & 7 -> L - ble .LSTRMM_L1x8_SAVE + ble STRMM_L1x8_SAVE -.LSTRMM_L1x8_SUB2: +STRMM_L1x8_SUB2: KERNEL1x8_SUB1 addic. L, L, -1 - bgt .LSTRMM_L1x8_SUB2 + bgt STRMM_L1x8_SUB2 -.LSTRMM_L1x8_SAVE: +STRMM_L1x8_SAVE: SAVE1x8 @@ -2554,12 +2553,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -.LSTRMM_L1x8_END: +STRMM_L1x8_END: -.LSTRMM_L1x4_BEGIN: +STRMM_L1x4_BEGIN: andi. T1, M, 4 - ble .LSTRMM_L1x4_END + ble STRMM_L1x4_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO @@ -2585,11 +2584,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L - ble .LSTRMM_L1x4_SUB0 + ble STRMM_L1x4_SUB0 cmpwi cr0, L, 1 - ble .LSTRMM_L1x4_SUB4 + ble STRMM_L1x4_SUB4 -.LSTRMM_L1x4_LOOP_START: +STRMM_L1x4_LOOP_START: LOAD1x4_1 KERNEL1x4_I1 @@ -2603,11 +2602,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x4_2 addic. L, L, -2 - ble .LSTRMM_L1x4_LOOP_END + ble STRMM_L1x4_LOOP_END .align 5 -.LSTRMM_L1x4_LOOP: +STRMM_L1x4_LOOP: KERNEL1x4_1 KERNEL1x4_2 @@ -2620,9 +2619,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x4_2 addic. L, L, -1 - bgt .LSTRMM_L1x4_LOOP + bgt STRMM_L1x4_LOOP -.LSTRMM_L1x4_LOOP_END: +STRMM_L1x4_LOOP_END: KERNEL1x4_1 KERNEL1x4_2 @@ -2634,9 +2633,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
KERNEL1x4_1 KERNEL1x4_E2 - b .LSTRMM_L1x4_SUB1 + b STRMM_L1x4_SUB1 -.LSTRMM_L1x4_SUB4: +STRMM_L1x4_SUB4: KERNEL1x4_SUBI1 KERNEL1x4_SUB1 @@ -2648,31 +2647,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x4_SUB1 KERNEL1x4_SUB1 - b .LSTRMM_L1x4_SUB1 + b STRMM_L1x4_SUB1 -.LSTRMM_L1x4_SUB0: +STRMM_L1x4_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL1x4_SUBI1 addic. L, L, -1 - ble .LSTRMM_L1x4_SAVE - b .LSTRMM_L1x4_SUB2 + ble STRMM_L1x4_SAVE + b STRMM_L1x4_SUB2 -.LSTRMM_L1x4_SUB1: +STRMM_L1x4_SUB1: andi. L, K1, 7 // K1 & 7 -> L - ble .LSTRMM_L1x4_SAVE + ble STRMM_L1x4_SAVE -.LSTRMM_L1x4_SUB2: +STRMM_L1x4_SUB2: KERNEL1x4_SUB1 addic. L, L, -1 - bgt .LSTRMM_L1x4_SUB2 + bgt STRMM_L1x4_SUB2 -.LSTRMM_L1x4_SAVE: +STRMM_L1x4_SAVE: SAVE1x4 @@ -2689,12 +2688,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -.LSTRMM_L1x4_END: +STRMM_L1x4_END: -.LSTRMM_L1x2_BEGIN: +STRMM_L1x2_BEGIN: andi. T1, M, 2 - ble .LSTRMM_L1x2_END + ble STRMM_L1x2_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO @@ -2720,11 +2719,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L - ble .LSTRMM_L1x2_SUB0 + ble STRMM_L1x2_SUB0 cmpwi cr0, L, 1 - ble .LSTRMM_L1x2_SUB4 + ble STRMM_L1x2_SUB4 -.LSTRMM_L1x2_LOOP_START: +STRMM_L1x2_LOOP_START: LOAD1x2_1 KERNEL1x2_I1 @@ -2738,11 +2737,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x2_2 addic. L, L, -2 - ble .LSTRMM_L1x2_LOOP_END + ble STRMM_L1x2_LOOP_END .align 5 -.LSTRMM_L1x2_LOOP: +STRMM_L1x2_LOOP: KERNEL1x2_1 KERNEL1x2_2 @@ -2755,9 +2754,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x2_2 addic. 
L, L, -1 - bgt .LSTRMM_L1x2_LOOP + bgt STRMM_L1x2_LOOP -.LSTRMM_L1x2_LOOP_END: +STRMM_L1x2_LOOP_END: KERNEL1x2_1 KERNEL1x2_2 @@ -2769,9 +2768,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x2_1 KERNEL1x2_E2 - b .LSTRMM_L1x2_SUB1 + b STRMM_L1x2_SUB1 -.LSTRMM_L1x2_SUB4: +STRMM_L1x2_SUB4: KERNEL1x2_SUBI1 KERNEL1x2_SUB1 @@ -2783,31 +2782,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x2_SUB1 KERNEL1x2_SUB1 - b .LSTRMM_L1x2_SUB1 + b STRMM_L1x2_SUB1 -.LSTRMM_L1x2_SUB0: +STRMM_L1x2_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL1x2_SUBI1 addic. L, L, -1 - ble .LSTRMM_L1x2_SAVE - b .LSTRMM_L1x2_SUB2 + ble STRMM_L1x2_SAVE + b STRMM_L1x2_SUB2 -.LSTRMM_L1x2_SUB1: +STRMM_L1x2_SUB1: andi. L, K1, 7 // K1 & 7 -> L - ble .LSTRMM_L1x2_SAVE + ble STRMM_L1x2_SAVE -.LSTRMM_L1x2_SUB2: +STRMM_L1x2_SUB2: KERNEL1x2_SUB1 addic. L, L, -1 - bgt .LSTRMM_L1x2_SUB2 + bgt STRMM_L1x2_SUB2 -.LSTRMM_L1x2_SAVE: +STRMM_L1x2_SAVE: SAVE1x2 @@ -2824,12 +2823,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -.LSTRMM_L1x2_END: +STRMM_L1x2_END: -.LSTRMM_L1x1_BEGIN: +STRMM_L1x1_BEGIN: andi. T1, M, 1 - ble .LSTRMM_L1x1_END + ble STRMM_L1x1_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO @@ -2855,11 +2854,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L - ble .LSTRMM_L1x1_SUB0 + ble STRMM_L1x1_SUB0 cmpwi cr0, L, 1 - ble .LSTRMM_L1x1_SUB4 + ble STRMM_L1x1_SUB4 -.LSTRMM_L1x1_LOOP_START: +STRMM_L1x1_LOOP_START: LOAD1x1_1 KERNEL1x1_I1 @@ -2873,11 +2872,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x1_2 addic. 
L, L, -2 - ble .LSTRMM_L1x1_LOOP_END + ble STRMM_L1x1_LOOP_END .align 5 -.LSTRMM_L1x1_LOOP: +STRMM_L1x1_LOOP: KERNEL1x1_1 KERNEL1x1_2 @@ -2890,9 +2889,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x1_2 addic. L, L, -1 - bgt .LSTRMM_L1x1_LOOP + bgt STRMM_L1x1_LOOP -.LSTRMM_L1x1_LOOP_END: +STRMM_L1x1_LOOP_END: KERNEL1x1_1 KERNEL1x1_2 @@ -2904,9 +2903,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x1_1 KERNEL1x1_E2 - b .LSTRMM_L1x1_SUB1 + b STRMM_L1x1_SUB1 -.LSTRMM_L1x1_SUB4: +STRMM_L1x1_SUB4: KERNEL1x1_SUBI1 KERNEL1x1_SUB1 @@ -2918,31 +2917,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x1_SUB1 KERNEL1x1_SUB1 - b .LSTRMM_L1x1_SUB1 + b STRMM_L1x1_SUB1 -.LSTRMM_L1x1_SUB0: +STRMM_L1x1_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL1x1_SUBI1 addic. L, L, -1 - ble .LSTRMM_L1x1_SAVE - b .LSTRMM_L1x1_SUB2 + ble STRMM_L1x1_SAVE + b STRMM_L1x1_SUB2 -.LSTRMM_L1x1_SUB1: +STRMM_L1x1_SUB1: andi. L, K1, 7 // K1 & 7 -> L - ble .LSTRMM_L1x1_SAVE + ble STRMM_L1x1_SAVE -.LSTRMM_L1x1_SUB2: +STRMM_L1x1_SUB2: KERNEL1x1_SUB1 addic. L, L, -1 - bgt .LSTRMM_L1x1_SUB2 + bgt STRMM_L1x1_SUB2 -.LSTRMM_L1x1_SAVE: +STRMM_L1x1_SAVE: SAVE1x1 @@ -2959,11 +2958,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -.LSTRMM_L1x1_END: +STRMM_L1x1_END: #if !defined(LEFT) addi KK, KK, 1 // KK += Number of values in B #endif -.LSTRMM_L1_END: +STRMM_L1_END: diff --git a/kernel/power/strmm_macros_16x8_power8.S b/kernel/power/strmm_macros_16x8_power8.S new file mode 100644 index 000000000..27bc1e89c --- /dev/null +++ b/kernel/power/strmm_macros_16x8_power8.S @@ -0,0 +1,5840 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2016/04/02 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + + +/********************************************************************************************** +* Macros for N=8 and M=16 +**********************************************************************************************/ + +.macro LOAD8x16_1 + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs12, vs29, 0 + xxspltw vs13, vs29, 1 + xxspltw vs14, vs29, 2 + xxspltw vs15, vs29, 3 + + addi BO, BO, 32 + +.endm + +.macro KERNEL8x16_I1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + lxvw4x vs6, o32, AO + lxvw4x vs7, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + xxspltw vs18, vs28, 2 + xxspltw vs19, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs20, vs29, 0 + xxspltw vs21, vs29, 1 + xxspltw vs22, vs29, 2 + xxspltw vs23, vs29, 3 + + addi BO, BO, 32 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + xvmulsp vs34, vs2, vs8 + xvmulsp vs35, vs3, vs8 + + xvmulsp vs36, vs0, vs9 + xvmulsp vs37, vs1, vs9 + xvmulsp vs38, vs2, vs9 + xvmulsp vs39, vs3, vs9 + + xvmulsp vs40, vs0, vs10 + xvmulsp vs41, vs1, vs10 + xvmulsp vs42, vs2, vs10 + xvmulsp vs43, vs3, vs10 + + xvmulsp vs44, vs0, vs11 + xvmulsp vs45, vs1, vs11 + xvmulsp vs46, vs2, vs11 + xvmulsp vs47, vs3, vs11 + + xvmulsp vs48, vs0, vs12 + xvmulsp vs49, vs1, vs12 + xvmulsp vs50, vs2, vs12 + xvmulsp vs51, vs3, vs12 + + xvmulsp vs52, vs0, vs13 + xvmulsp vs53, vs1, vs13 + xvmulsp vs54, 
vs2, vs13 + xvmulsp vs55, vs3, vs13 + + xvmulsp vs56, vs0, vs14 + xvmulsp vs57, vs1, vs14 + xvmulsp vs58, vs2, vs14 + xvmulsp vs59, vs3, vs14 + + xvmulsp vs60, vs0, vs15 + xvmulsp vs61, vs1, vs15 + xvmulsp vs62, vs2, vs15 + xvmulsp vs63, vs3, vs15 + + +.endm + +.macro KERNEL8x16_1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + lxvw4x vs6, o32, AO + lxvw4x vs7, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + xxspltw vs18, vs28, 2 + xxspltw vs19, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs20, vs29, 0 + xxspltw vs21, vs29, 1 + xxspltw vs22, vs29, 2 + xxspltw vs23, vs29, 3 + + addi BO, BO, 32 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + xvmaddasp vs34, vs2, vs8 + xvmaddasp vs35, vs3, vs8 + + xvmaddasp vs36, vs0, vs9 + xvmaddasp vs37, vs1, vs9 + xvmaddasp vs38, vs2, vs9 + xvmaddasp vs39, vs3, vs9 + + xvmaddasp vs40, vs0, vs10 + xvmaddasp vs41, vs1, vs10 + xvmaddasp vs42, vs2, vs10 + xvmaddasp vs43, vs3, vs10 + + xvmaddasp vs44, vs0, vs11 + xvmaddasp vs45, vs1, vs11 + xvmaddasp vs46, vs2, vs11 + xvmaddasp vs47, vs3, vs11 + + xvmaddasp vs48, vs0, vs12 + xvmaddasp vs49, vs1, vs12 + xvmaddasp vs50, vs2, vs12 + xvmaddasp vs51, vs3, vs12 + + xvmaddasp vs52, vs0, vs13 + xvmaddasp vs53, vs1, vs13 + xvmaddasp vs54, vs2, vs13 + xvmaddasp vs55, vs3, vs13 + + xvmaddasp vs56, vs0, vs14 + xvmaddasp vs57, vs1, vs14 + xvmaddasp vs58, vs2, vs14 + xvmaddasp vs59, vs3, vs14 + + xvmaddasp vs60, vs0, vs15 + xvmaddasp vs61, vs1, vs15 + xvmaddasp vs62, vs2, vs15 + xvmaddasp vs63, vs3, vs15 + + +.endm + +.macro KERNEL8x16_2 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs12, vs29, 0 + xxspltw vs13, vs29, 1 + xxspltw vs14, vs29, 2 + xxspltw vs15, vs29, 3 + + addi BO, BO, 32 + + + xvmaddasp vs32, vs4, 
vs16 + xvmaddasp vs33, vs5, vs16 + xvmaddasp vs34, vs6, vs16 + xvmaddasp vs35, vs7, vs16 + + xvmaddasp vs36, vs4, vs17 + xvmaddasp vs37, vs5, vs17 + xvmaddasp vs38, vs6, vs17 + xvmaddasp vs39, vs7, vs17 + + xvmaddasp vs40, vs4, vs18 + xvmaddasp vs41, vs5, vs18 + xvmaddasp vs42, vs6, vs18 + xvmaddasp vs43, vs7, vs18 + + xvmaddasp vs44, vs4, vs19 + xvmaddasp vs45, vs5, vs19 + xvmaddasp vs46, vs6, vs19 + xvmaddasp vs47, vs7, vs19 + + xvmaddasp vs48, vs4, vs20 + xvmaddasp vs49, vs5, vs20 + xvmaddasp vs50, vs6, vs20 + xvmaddasp vs51, vs7, vs20 + + xvmaddasp vs52, vs4, vs21 + xvmaddasp vs53, vs5, vs21 + xvmaddasp vs54, vs6, vs21 + xvmaddasp vs55, vs7, vs21 + + xvmaddasp vs56, vs4, vs22 + xvmaddasp vs57, vs5, vs22 + xvmaddasp vs58, vs6, vs22 + xvmaddasp vs59, vs7, vs22 + + xvmaddasp vs60, vs4, vs23 + xvmaddasp vs61, vs5, vs23 + xvmaddasp vs62, vs6, vs23 + xvmaddasp vs63, vs7, vs23 + + +.endm + +.macro KERNEL8x16_E2 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + xvmaddasp vs34, vs6, vs16 + xvmaddasp vs35, vs7, vs16 + + xvmaddasp vs36, vs4, vs17 + xvmaddasp vs37, vs5, vs17 + xvmaddasp vs38, vs6, vs17 + xvmaddasp vs39, vs7, vs17 + + xvmaddasp vs40, vs4, vs18 + xvmaddasp vs41, vs5, vs18 + xvmaddasp vs42, vs6, vs18 + xvmaddasp vs43, vs7, vs18 + + xvmaddasp vs44, vs4, vs19 + xvmaddasp vs45, vs5, vs19 + xvmaddasp vs46, vs6, vs19 + xvmaddasp vs47, vs7, vs19 + + xvmaddasp vs48, vs4, vs20 + xvmaddasp vs49, vs5, vs20 + xvmaddasp vs50, vs6, vs20 + xvmaddasp vs51, vs7, vs20 + + xvmaddasp vs52, vs4, vs21 + xvmaddasp vs53, vs5, vs21 + xvmaddasp vs54, vs6, vs21 + xvmaddasp vs55, vs7, vs21 + + xvmaddasp vs56, vs4, vs22 + xvmaddasp vs57, vs5, vs22 + xvmaddasp vs58, vs6, vs22 + xvmaddasp vs59, vs7, vs22 + + xvmaddasp vs60, vs4, vs23 + xvmaddasp vs61, vs5, vs23 + xvmaddasp vs62, vs6, vs23 + xvmaddasp vs63, vs7, vs23 + + +.endm + +.macro KERNEL8x16_SUBI1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + lxvw4x 
vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs12, vs29, 0 + xxspltw vs13, vs29, 1 + xxspltw vs14, vs29, 2 + xxspltw vs15, vs29, 3 + + addi BO, BO, 32 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + xvmulsp vs34, vs2, vs8 + xvmulsp vs35, vs3, vs8 + + xvmulsp vs36, vs0, vs9 + xvmulsp vs37, vs1, vs9 + xvmulsp vs38, vs2, vs9 + xvmulsp vs39, vs3, vs9 + + xvmulsp vs40, vs0, vs10 + xvmulsp vs41, vs1, vs10 + xvmulsp vs42, vs2, vs10 + xvmulsp vs43, vs3, vs10 + + xvmulsp vs44, vs0, vs11 + xvmulsp vs45, vs1, vs11 + xvmulsp vs46, vs2, vs11 + xvmulsp vs47, vs3, vs11 + + xvmulsp vs48, vs0, vs12 + xvmulsp vs49, vs1, vs12 + xvmulsp vs50, vs2, vs12 + xvmulsp vs51, vs3, vs12 + + xvmulsp vs52, vs0, vs13 + xvmulsp vs53, vs1, vs13 + xvmulsp vs54, vs2, vs13 + xvmulsp vs55, vs3, vs13 + + xvmulsp vs56, vs0, vs14 + xvmulsp vs57, vs1, vs14 + xvmulsp vs58, vs2, vs14 + xvmulsp vs59, vs3, vs14 + + xvmulsp vs60, vs0, vs15 + xvmulsp vs61, vs1, vs15 + xvmulsp vs62, vs2, vs15 + xvmulsp vs63, vs3, vs15 + + +.endm + +.macro KERNEL8x16_SUB1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs12, vs29, 0 + xxspltw vs13, vs29, 1 + xxspltw vs14, vs29, 2 + xxspltw vs15, vs29, 3 + + addi BO, BO, 32 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + xvmaddasp vs34, vs2, vs8 + xvmaddasp vs35, vs3, vs8 + + xvmaddasp vs36, vs0, vs9 + xvmaddasp vs37, vs1, vs9 + xvmaddasp vs38, vs2, vs9 + xvmaddasp vs39, vs3, vs9 + + xvmaddasp vs40, vs0, vs10 + xvmaddasp vs41, vs1, vs10 + xvmaddasp vs42, vs2, vs10 + xvmaddasp vs43, vs3, vs10 + + xvmaddasp vs44, vs0, vs11 + xvmaddasp vs45, vs1, vs11 + xvmaddasp vs46, vs2, vs11 + xvmaddasp vs47, vs3, vs11 + + xvmaddasp vs48, vs0, vs12 + xvmaddasp 
vs49, vs1, vs12 + xvmaddasp vs50, vs2, vs12 + xvmaddasp vs51, vs3, vs12 + + xvmaddasp vs52, vs0, vs13 + xvmaddasp vs53, vs1, vs13 + xvmaddasp vs54, vs2, vs13 + xvmaddasp vs55, vs3, vs13 + + xvmaddasp vs56, vs0, vs14 + xvmaddasp vs57, vs1, vs14 + xvmaddasp vs58, vs2, vs14 + xvmaddasp vs59, vs3, vs14 + + xvmaddasp vs60, vs0, vs15 + xvmaddasp vs61, vs1, vs15 + xvmaddasp vs62, vs2, vs15 + xvmaddasp vs63, vs3, vs15 + + +.endm + +.macro SAVE8x16 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs32, alpha_vr + xvmulsp vs1, vs33, alpha_vr + xvmulsp vs2, vs34, alpha_vr + xvmulsp vs3, vs35, alpha_vr +#else + xvmaddasp vs0, vs32, alpha_vr + xvmaddasp vs1, vs33, alpha_vr + xvmaddasp vs2, vs34, alpha_vr + xvmaddasp vs3, vs35, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs36, alpha_vr + xvmulsp vs1, vs37, alpha_vr + xvmulsp vs2, vs38, alpha_vr + xvmulsp vs3, vs39, alpha_vr +#else + xvmaddasp vs0, vs36, alpha_vr + xvmaddasp vs1, vs37, alpha_vr + xvmaddasp vs2, vs38, alpha_vr + xvmaddasp vs3, vs39, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs40, alpha_vr + xvmulsp vs1, vs41, alpha_vr + xvmulsp vs2, vs42, alpha_vr + xvmulsp vs3, vs43, alpha_vr +#else + xvmaddasp vs0, vs40, alpha_vr + xvmaddasp vs1, vs41, alpha_vr + xvmaddasp vs2, vs42, alpha_vr + xvmaddasp vs3, vs43, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + 
add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs44, alpha_vr + xvmulsp vs1, vs45, alpha_vr + xvmulsp vs2, vs46, alpha_vr + xvmulsp vs3, vs47, alpha_vr +#else + xvmaddasp vs0, vs44, alpha_vr + xvmaddasp vs1, vs45, alpha_vr + xvmaddasp vs2, vs46, alpha_vr + xvmaddasp vs3, vs47, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs48, alpha_vr + xvmulsp vs1, vs49, alpha_vr + xvmulsp vs2, vs50, alpha_vr + xvmulsp vs3, vs51, alpha_vr +#else + xvmaddasp vs0, vs48, alpha_vr + xvmaddasp vs1, vs49, alpha_vr + xvmaddasp vs2, vs50, alpha_vr + xvmaddasp vs3, vs51, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs52, alpha_vr + xvmulsp vs1, vs53, alpha_vr + xvmulsp vs2, vs54, alpha_vr + xvmulsp vs3, vs55, alpha_vr +#else + xvmaddasp vs0, vs52, alpha_vr + xvmaddasp vs1, vs53, alpha_vr + xvmaddasp vs2, vs54, alpha_vr + xvmaddasp vs3, vs55, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs56, alpha_vr + xvmulsp vs1, vs57, alpha_vr + xvmulsp vs2, vs58, alpha_vr + xvmulsp vs3, vs59, alpha_vr +#else + xvmaddasp vs0, vs56, alpha_vr + xvmaddasp vs1, vs57, alpha_vr + xvmaddasp vs2, vs58, alpha_vr + xvmaddasp vs3, vs59, alpha_vr +#endif + + stxvw4x vs0, 
o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs60, alpha_vr + xvmulsp vs1, vs61, alpha_vr + xvmulsp vs2, vs62, alpha_vr + xvmulsp vs3, vs63, alpha_vr +#else + xvmaddasp vs0, vs60, alpha_vr + xvmaddasp vs1, vs61, alpha_vr + xvmaddasp vs2, vs62, alpha_vr + xvmaddasp vs3, vs63, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + addi CO, CO, 64 + +.endm + + +/********************************************************************************************** +* Macros for N=8 and M=8 +**********************************************************************************************/ + +.macro LOAD8x8_1 + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs12, vs29, 0 + xxspltw vs13, vs29, 1 + xxspltw vs14, vs29, 2 + xxspltw vs15, vs29, 3 + + addi BO, BO, 32 + +.endm + +.macro KERNEL8x8_I1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + xxspltw vs18, vs28, 2 + xxspltw vs19, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs20, vs29, 0 + xxspltw vs21, vs29, 1 + xxspltw vs22, vs29, 2 + xxspltw vs23, vs29, 3 + + addi BO, BO, 32 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + + xvmulsp vs34, vs0, vs9 + xvmulsp vs35, vs1, vs9 + + xvmulsp vs36, vs0, vs10 + xvmulsp vs37, vs1, vs10 + + xvmulsp vs38, vs0, vs11 + xvmulsp vs39, vs1, vs11 + + xvmulsp vs40, vs0, vs12 + xvmulsp vs41, vs1, vs12 + + xvmulsp vs42, vs0, vs13 + xvmulsp vs43, vs1, vs13 + + xvmulsp vs44, vs0, vs14 + xvmulsp vs45, vs1, vs14 + + xvmulsp vs46, vs0, vs15 + xvmulsp 
vs47, vs1, vs15 + + +.endm + +.macro KERNEL8x8_1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + xxspltw vs18, vs28, 2 + xxspltw vs19, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs20, vs29, 0 + xxspltw vs21, vs29, 1 + xxspltw vs22, vs29, 2 + xxspltw vs23, vs29, 3 + + addi BO, BO, 32 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + + xvmaddasp vs34, vs0, vs9 + xvmaddasp vs35, vs1, vs9 + + xvmaddasp vs36, vs0, vs10 + xvmaddasp vs37, vs1, vs10 + + xvmaddasp vs38, vs0, vs11 + xvmaddasp vs39, vs1, vs11 + + xvmaddasp vs40, vs0, vs12 + xvmaddasp vs41, vs1, vs12 + + xvmaddasp vs42, vs0, vs13 + xvmaddasp vs43, vs1, vs13 + + xvmaddasp vs44, vs0, vs14 + xvmaddasp vs45, vs1, vs14 + + xvmaddasp vs46, vs0, vs15 + xvmaddasp vs47, vs1, vs15 + + +.endm + +.macro KERNEL8x8_2 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs12, vs29, 0 + xxspltw vs13, vs29, 1 + xxspltw vs14, vs29, 2 + xxspltw vs15, vs29, 3 + + addi BO, BO, 32 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + + xvmaddasp vs34, vs4, vs17 + xvmaddasp vs35, vs5, vs17 + + xvmaddasp vs36, vs4, vs18 + xvmaddasp vs37, vs5, vs18 + + xvmaddasp vs38, vs4, vs19 + xvmaddasp vs39, vs5, vs19 + + xvmaddasp vs40, vs4, vs20 + xvmaddasp vs41, vs5, vs20 + + xvmaddasp vs42, vs4, vs21 + xvmaddasp vs43, vs5, vs21 + + xvmaddasp vs44, vs4, vs22 + xvmaddasp vs45, vs5, vs22 + + xvmaddasp vs46, vs4, vs23 + xvmaddasp vs47, vs5, vs23 + + +.endm + +.macro KERNEL8x8_E2 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + + xvmaddasp vs34, vs4, vs17 + xvmaddasp vs35, vs5, vs17 + + xvmaddasp vs36, vs4, vs18 + xvmaddasp vs37, vs5, vs18 + + xvmaddasp vs38, vs4, vs19 + xvmaddasp vs39, vs5, vs19 + + xvmaddasp vs40, vs4, vs20 + xvmaddasp vs41, vs5, vs20 + 
+ xvmaddasp vs42, vs4, vs21 + xvmaddasp vs43, vs5, vs21 + + xvmaddasp vs44, vs4, vs22 + xvmaddasp vs45, vs5, vs22 + + xvmaddasp vs46, vs4, vs23 + xvmaddasp vs47, vs5, vs23 + + +.endm + +.macro KERNEL8x8_SUBI1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs12, vs29, 0 + xxspltw vs13, vs29, 1 + xxspltw vs14, vs29, 2 + xxspltw vs15, vs29, 3 + + addi BO, BO, 32 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + + xvmulsp vs34, vs0, vs9 + xvmulsp vs35, vs1, vs9 + + xvmulsp vs36, vs0, vs10 + xvmulsp vs37, vs1, vs10 + + xvmulsp vs38, vs0, vs11 + xvmulsp vs39, vs1, vs11 + + xvmulsp vs40, vs0, vs12 + xvmulsp vs41, vs1, vs12 + + xvmulsp vs42, vs0, vs13 + xvmulsp vs43, vs1, vs13 + + xvmulsp vs44, vs0, vs14 + xvmulsp vs45, vs1, vs14 + + xvmulsp vs46, vs0, vs15 + xvmulsp vs47, vs1, vs15 + + +.endm + +.macro KERNEL8x8_SUB1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs12, vs29, 0 + xxspltw vs13, vs29, 1 + xxspltw vs14, vs29, 2 + xxspltw vs15, vs29, 3 + + addi BO, BO, 32 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + + xvmaddasp vs34, vs0, vs9 + xvmaddasp vs35, vs1, vs9 + + xvmaddasp vs36, vs0, vs10 + xvmaddasp vs37, vs1, vs10 + + xvmaddasp vs38, vs0, vs11 + xvmaddasp vs39, vs1, vs11 + + xvmaddasp vs40, vs0, vs12 + xvmaddasp vs41, vs1, vs12 + + xvmaddasp vs42, vs0, vs13 + xvmaddasp vs43, vs1, vs13 + + xvmaddasp vs44, vs0, vs14 + xvmaddasp vs45, vs1, vs14 + + xvmaddasp vs46, vs0, vs15 + xvmaddasp vs47, vs1, vs15 + + +.endm + +.macro SAVE8x8 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs32, alpha_vr + xvmulsp vs1, vs33, alpha_vr 
+#else + xvmaddasp vs0, vs32, alpha_vr + xvmaddasp vs1, vs33, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs34, alpha_vr + xvmulsp vs1, vs35, alpha_vr +#else + xvmaddasp vs0, vs34, alpha_vr + xvmaddasp vs1, vs35, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs36, alpha_vr + xvmulsp vs1, vs37, alpha_vr +#else + xvmaddasp vs0, vs36, alpha_vr + xvmaddasp vs1, vs37, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs38, alpha_vr + xvmulsp vs1, vs39, alpha_vr +#else + xvmaddasp vs0, vs38, alpha_vr + xvmaddasp vs1, vs39, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs40, alpha_vr + xvmulsp vs1, vs41, alpha_vr +#else + xvmaddasp vs0, vs40, alpha_vr + xvmaddasp vs1, vs41, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs42, alpha_vr + xvmulsp vs1, vs43, alpha_vr +#else + xvmaddasp vs0, vs42, alpha_vr + xvmaddasp vs1, vs43, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs44, alpha_vr + xvmulsp vs1, vs45, alpha_vr +#else + xvmaddasp vs0, vs44, alpha_vr + xvmaddasp vs1, vs45, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + +#ifndef 
TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs46, alpha_vr + xvmulsp vs1, vs47, alpha_vr +#else + xvmaddasp vs0, vs46, alpha_vr + xvmaddasp vs1, vs47, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + addi CO, CO, 32 + +.endm + + +/********************************************************************************************** +* Macros for N=8 and M=4 +**********************************************************************************************/ + +.macro LOAD8x4_1 + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs12, vs29, 0 + xxspltw vs13, vs29, 1 + xxspltw vs14, vs29, 2 + xxspltw vs15, vs29, 3 + + addi BO, BO, 32 + +.endm + +.macro KERNEL8x4_I1 + + + lxvw4x vs4, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + xxspltw vs18, vs28, 2 + xxspltw vs19, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs20, vs29, 0 + xxspltw vs21, vs29, 1 + xxspltw vs22, vs29, 2 + xxspltw vs23, vs29, 3 + + addi BO, BO, 32 + + + xvmulsp vs32, vs0, vs8 + + xvmulsp vs33, vs0, vs9 + + xvmulsp vs34, vs0, vs10 + + xvmulsp vs35, vs0, vs11 + + xvmulsp vs36, vs0, vs12 + + xvmulsp vs37, vs0, vs13 + + xvmulsp vs38, vs0, vs14 + + xvmulsp vs39, vs0, vs15 + + +.endm + +.macro KERNEL8x4_1 + + + lxvw4x vs4, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + xxspltw vs18, vs28, 2 + xxspltw vs19, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs20, vs29, 0 + xxspltw vs21, vs29, 1 + xxspltw vs22, vs29, 2 + xxspltw vs23, vs29, 3 + + addi BO, BO, 32 + + + xvmaddasp vs32, vs0, vs8 + + xvmaddasp vs33, vs0, vs9 + + xvmaddasp vs34, vs0, vs10 + + xvmaddasp vs35, vs0, vs11 + + xvmaddasp vs36, vs0, vs12 + + xvmaddasp vs37, vs0, vs13 + + xvmaddasp vs38, vs0, vs14 + + 
xvmaddasp vs39, vs0, vs15 + + +.endm + +.macro KERNEL8x4_2 + + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs12, vs29, 0 + xxspltw vs13, vs29, 1 + xxspltw vs14, vs29, 2 + xxspltw vs15, vs29, 3 + + addi BO, BO, 32 + + + xvmaddasp vs32, vs4, vs16 + + xvmaddasp vs33, vs4, vs17 + + xvmaddasp vs34, vs4, vs18 + + xvmaddasp vs35, vs4, vs19 + + xvmaddasp vs36, vs4, vs20 + + xvmaddasp vs37, vs4, vs21 + + xvmaddasp vs38, vs4, vs22 + + xvmaddasp vs39, vs4, vs23 + + +.endm + +.macro KERNEL8x4_E2 + + + xvmaddasp vs32, vs4, vs16 + + xvmaddasp vs33, vs4, vs17 + + xvmaddasp vs34, vs4, vs18 + + xvmaddasp vs35, vs4, vs19 + + xvmaddasp vs36, vs4, vs20 + + xvmaddasp vs37, vs4, vs21 + + xvmaddasp vs38, vs4, vs22 + + xvmaddasp vs39, vs4, vs23 + + +.endm + +.macro KERNEL8x4_SUBI1 + + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs12, vs29, 0 + xxspltw vs13, vs29, 1 + xxspltw vs14, vs29, 2 + xxspltw vs15, vs29, 3 + + addi BO, BO, 32 + + + xvmulsp vs32, vs0, vs8 + + xvmulsp vs33, vs0, vs9 + + xvmulsp vs34, vs0, vs10 + + xvmulsp vs35, vs0, vs11 + + xvmulsp vs36, vs0, vs12 + + xvmulsp vs37, vs0, vs13 + + xvmulsp vs38, vs0, vs14 + + xvmulsp vs39, vs0, vs15 + + +.endm + +.macro KERNEL8x4_SUB1 + + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + lxvw4x vs29, o16, BO + + xxspltw vs12, vs29, 0 + xxspltw vs13, vs29, 1 + xxspltw vs14, vs29, 2 + xxspltw vs15, vs29, 3 + + addi BO, BO, 32 + + + xvmaddasp vs32, vs0, vs8 + + xvmaddasp vs33, vs0, vs9 + + xvmaddasp vs34, vs0, vs10 + + xvmaddasp vs35, vs0, vs11 + + xvmaddasp vs36, vs0, vs12 + + xvmaddasp vs37, vs0, vs13 + + xvmaddasp vs38, 
vs0, vs14 + + xvmaddasp vs39, vs0, vs15 + + +.endm + +.macro SAVE8x4 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs32, alpha_vr +#else + xvmaddasp vs0, vs32, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs33, alpha_vr +#else + xvmaddasp vs0, vs33, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs34, alpha_vr +#else + xvmaddasp vs0, vs34, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs35, alpha_vr +#else + xvmaddasp vs0, vs35, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs36, alpha_vr +#else + xvmaddasp vs0, vs36, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs37, alpha_vr +#else + xvmaddasp vs0, vs37, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs38, alpha_vr +#else + xvmaddasp vs0, vs38, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs39, alpha_vr +#else + xvmaddasp vs0, vs39, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + addi CO, CO, 16 + +.endm + + +/********************************************************************************************** +* Macros for N=8 and M=2 +**********************************************************************************************/ + +.macro LOAD8x2_1 + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi 
AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi T1, T1, 16 + + lxsspx vs12, o0, T1 + lxsspx vs13, o4, T1 + lxsspx vs14, o8, T1 + lxsspx vs15, o12, T1 + + addi BO, BO, 32 + +.endm + +.macro KERNEL8x2_I1 + + + lxsspx vs4, o0, AO + lxsspx vs5, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs16, o0, T1 + lxsspx vs17, o4, T1 + lxsspx vs18, o8, T1 + lxsspx vs19, o12, T1 + + addi T1, T1, 16 + + lxsspx vs20, o0, T1 + lxsspx vs21, o4, T1 + lxsspx vs22, o8, T1 + lxsspx vs23, o12, T1 + + addi BO, BO, 32 + + + xsmuldp vs32, vs0, vs8 + xsmuldp vs33, vs1, vs8 + + xsmuldp vs34, vs0, vs9 + xsmuldp vs35, vs1, vs9 + + xsmuldp vs36, vs0, vs10 + xsmuldp vs37, vs1, vs10 + + xsmuldp vs38, vs0, vs11 + xsmuldp vs39, vs1, vs11 + + xsmuldp vs40, vs0, vs12 + xsmuldp vs41, vs1, vs12 + + xsmuldp vs42, vs0, vs13 + xsmuldp vs43, vs1, vs13 + + xsmuldp vs44, vs0, vs14 + xsmuldp vs45, vs1, vs14 + + xsmuldp vs46, vs0, vs15 + xsmuldp vs47, vs1, vs15 + + +.endm + +.macro KERNEL8x2_1 + + + lxsspx vs4, o0, AO + lxsspx vs5, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs16, o0, T1 + lxsspx vs17, o4, T1 + lxsspx vs18, o8, T1 + lxsspx vs19, o12, T1 + + addi T1, T1, 16 + + lxsspx vs20, o0, T1 + lxsspx vs21, o4, T1 + lxsspx vs22, o8, T1 + lxsspx vs23, o12, T1 + + addi BO, BO, 32 + + + xsmaddadp vs32, vs0, vs8 + xsmaddadp vs33, vs1, vs8 + + xsmaddadp vs34, vs0, vs9 + xsmaddadp vs35, vs1, vs9 + + xsmaddadp vs36, vs0, vs10 + xsmaddadp vs37, vs1, vs10 + + xsmaddadp vs38, vs0, vs11 + xsmaddadp vs39, vs1, vs11 + + xsmaddadp vs40, vs0, vs12 + xsmaddadp vs41, vs1, vs12 + + xsmaddadp vs42, vs0, vs13 + xsmaddadp vs43, vs1, vs13 + + xsmaddadp vs44, vs0, vs14 + xsmaddadp vs45, vs1, vs14 + + xsmaddadp vs46, vs0, vs15 + xsmaddadp vs47, vs1, vs15 + + +.endm + +.macro KERNEL8x2_2 + + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, 
o12, T1 + + addi T1, T1, 16 + + lxsspx vs12, o0, T1 + lxsspx vs13, o4, T1 + lxsspx vs14, o8, T1 + lxsspx vs15, o12, T1 + + addi BO, BO, 32 + + + xsmaddadp vs32, vs4, vs16 + xsmaddadp vs33, vs5, vs16 + + xsmaddadp vs34, vs4, vs17 + xsmaddadp vs35, vs5, vs17 + + xsmaddadp vs36, vs4, vs18 + xsmaddadp vs37, vs5, vs18 + + xsmaddadp vs38, vs4, vs19 + xsmaddadp vs39, vs5, vs19 + + xsmaddadp vs40, vs4, vs20 + xsmaddadp vs41, vs5, vs20 + + xsmaddadp vs42, vs4, vs21 + xsmaddadp vs43, vs5, vs21 + + xsmaddadp vs44, vs4, vs22 + xsmaddadp vs45, vs5, vs22 + + xsmaddadp vs46, vs4, vs23 + xsmaddadp vs47, vs5, vs23 + + +.endm + +.macro KERNEL8x2_E2 + + + xsmaddadp vs32, vs4, vs16 + xsmaddadp vs33, vs5, vs16 + + xsmaddadp vs34, vs4, vs17 + xsmaddadp vs35, vs5, vs17 + + xsmaddadp vs36, vs4, vs18 + xsmaddadp vs37, vs5, vs18 + + xsmaddadp vs38, vs4, vs19 + xsmaddadp vs39, vs5, vs19 + + xsmaddadp vs40, vs4, vs20 + xsmaddadp vs41, vs5, vs20 + + xsmaddadp vs42, vs4, vs21 + xsmaddadp vs43, vs5, vs21 + + xsmaddadp vs44, vs4, vs22 + xsmaddadp vs45, vs5, vs22 + + xsmaddadp vs46, vs4, vs23 + xsmaddadp vs47, vs5, vs23 + + +.endm + +.macro KERNEL8x2_SUBI1 + + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi T1, T1, 16 + + lxsspx vs12, o0, T1 + lxsspx vs13, o4, T1 + lxsspx vs14, o8, T1 + lxsspx vs15, o12, T1 + + addi BO, BO, 32 + + + xsmuldp vs32, vs0, vs8 + xsmuldp vs33, vs1, vs8 + + xsmuldp vs34, vs0, vs9 + xsmuldp vs35, vs1, vs9 + + xsmuldp vs36, vs0, vs10 + xsmuldp vs37, vs1, vs10 + + xsmuldp vs38, vs0, vs11 + xsmuldp vs39, vs1, vs11 + + xsmuldp vs40, vs0, vs12 + xsmuldp vs41, vs1, vs12 + + xsmuldp vs42, vs0, vs13 + xsmuldp vs43, vs1, vs13 + + xsmuldp vs44, vs0, vs14 + xsmuldp vs45, vs1, vs14 + + xsmuldp vs46, vs0, vs15 + xsmuldp vs47, vs1, vs15 + + +.endm + +.macro KERNEL8x2_SUB1 + + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx 
vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi T1, T1, 16 + + lxsspx vs12, o0, T1 + lxsspx vs13, o4, T1 + lxsspx vs14, o8, T1 + lxsspx vs15, o12, T1 + + addi BO, BO, 32 + + + xsmaddadp vs32, vs0, vs8 + xsmaddadp vs33, vs1, vs8 + + xsmaddadp vs34, vs0, vs9 + xsmaddadp vs35, vs1, vs9 + + xsmaddadp vs36, vs0, vs10 + xsmaddadp vs37, vs1, vs10 + + xsmaddadp vs38, vs0, vs11 + xsmaddadp vs39, vs1, vs11 + + xsmaddadp vs40, vs0, vs12 + xsmaddadp vs41, vs1, vs12 + + xsmaddadp vs42, vs0, vs13 + xsmaddadp vs43, vs1, vs13 + + xsmaddadp vs44, vs0, vs14 + xsmaddadp vs45, vs1, vs14 + + xsmaddadp vs46, vs0, vs15 + xsmaddadp vs47, vs1, vs15 + + +.endm + +.macro SAVE8x2 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs32, alpha_r + xsmuldp vs1, vs33, alpha_r +#else + xsmaddadp vs0, vs32, alpha_r + xsmaddadp vs1, vs33, alpha_r +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs34, alpha_r + xsmuldp vs1, vs35, alpha_r +#else + xsmaddadp vs0, vs34, alpha_r + xsmaddadp vs1, vs35, alpha_r +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs36, alpha_r + xsmuldp vs1, vs37, alpha_r +#else + xsmaddadp vs0, vs36, alpha_r + xsmaddadp vs1, vs37, alpha_r +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs38, alpha_r + xsmuldp vs1, vs39, alpha_r +#else + xsmaddadp vs0, vs38, alpha_r + xsmaddadp vs1, vs39, alpha_r +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef 
TRMMKERNEL + xsmuldp vs0, vs40, alpha_r + xsmuldp vs1, vs41, alpha_r +#else + xsmaddadp vs0, vs40, alpha_r + xsmaddadp vs1, vs41, alpha_r +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs42, alpha_r + xsmuldp vs1, vs43, alpha_r +#else + xsmaddadp vs0, vs42, alpha_r + xsmaddadp vs1, vs43, alpha_r +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs44, alpha_r + xsmuldp vs1, vs45, alpha_r +#else + xsmaddadp vs0, vs44, alpha_r + xsmaddadp vs1, vs45, alpha_r +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs46, alpha_r + xsmuldp vs1, vs47, alpha_r +#else + xsmaddadp vs0, vs46, alpha_r + xsmaddadp vs1, vs47, alpha_r +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + addi CO, CO, 8 + +.endm + + +/********************************************************************************************** +* Macros for N=8 and M=1 +**********************************************************************************************/ + +.macro LOAD8x1_1 + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi T1, T1, 16 + + lxsspx vs12, o0, T1 + lxsspx vs13, o4, T1 + lxsspx vs14, o8, T1 + lxsspx vs15, o12, T1 + + addi BO, BO, 32 + +.endm + +.macro KERNEL8x1_I1 + + + lxsspx vs4, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs16, o0, T1 + lxsspx vs17, o4, T1 + lxsspx vs18, o8, T1 + lxsspx vs19, o12, T1 + + addi T1, T1, 16 + + lxsspx vs20, o0, T1 + lxsspx vs21, o4, T1 + lxsspx vs22, o8, T1 + lxsspx vs23, o12, T1 + + addi BO, BO, 32 + + + xsmuldp vs32, vs0, 
vs8 + + xsmuldp vs33, vs0, vs9 + + xsmuldp vs34, vs0, vs10 + + xsmuldp vs35, vs0, vs11 + + xsmuldp vs36, vs0, vs12 + + xsmuldp vs37, vs0, vs13 + + xsmuldp vs38, vs0, vs14 + + xsmuldp vs39, vs0, vs15 + + +.endm + +.macro KERNEL8x1_1 + + + lxsspx vs4, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs16, o0, T1 + lxsspx vs17, o4, T1 + lxsspx vs18, o8, T1 + lxsspx vs19, o12, T1 + + addi T1, T1, 16 + + lxsspx vs20, o0, T1 + lxsspx vs21, o4, T1 + lxsspx vs22, o8, T1 + lxsspx vs23, o12, T1 + + addi BO, BO, 32 + + + xsmaddadp vs32, vs0, vs8 + + xsmaddadp vs33, vs0, vs9 + + xsmaddadp vs34, vs0, vs10 + + xsmaddadp vs35, vs0, vs11 + + xsmaddadp vs36, vs0, vs12 + + xsmaddadp vs37, vs0, vs13 + + xsmaddadp vs38, vs0, vs14 + + xsmaddadp vs39, vs0, vs15 + + +.endm + +.macro KERNEL8x1_2 + + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi T1, T1, 16 + + lxsspx vs12, o0, T1 + lxsspx vs13, o4, T1 + lxsspx vs14, o8, T1 + lxsspx vs15, o12, T1 + + addi BO, BO, 32 + + + xsmaddadp vs32, vs4, vs16 + + xsmaddadp vs33, vs4, vs17 + + xsmaddadp vs34, vs4, vs18 + + xsmaddadp vs35, vs4, vs19 + + xsmaddadp vs36, vs4, vs20 + + xsmaddadp vs37, vs4, vs21 + + xsmaddadp vs38, vs4, vs22 + + xsmaddadp vs39, vs4, vs23 + + +.endm + +.macro KERNEL8x1_E2 + + + xsmaddadp vs32, vs4, vs16 + + xsmaddadp vs33, vs4, vs17 + + xsmaddadp vs34, vs4, vs18 + + xsmaddadp vs35, vs4, vs19 + + xsmaddadp vs36, vs4, vs20 + + xsmaddadp vs37, vs4, vs21 + + xsmaddadp vs38, vs4, vs22 + + xsmaddadp vs39, vs4, vs23 + + +.endm + +.macro KERNEL8x1_SUBI1 + + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi T1, T1, 16 + + lxsspx vs12, o0, T1 + lxsspx vs13, o4, T1 + lxsspx vs14, o8, T1 + lxsspx vs15, o12, T1 + + addi BO, BO, 32 + + + xsmuldp vs32, vs0, vs8 + + xsmuldp vs33, vs0, vs9 + + xsmuldp vs34, vs0, vs10 + + xsmuldp vs35, 
vs0, vs11 + + xsmuldp vs36, vs0, vs12 + + xsmuldp vs37, vs0, vs13 + + xsmuldp vs38, vs0, vs14 + + xsmuldp vs39, vs0, vs15 + + +.endm + +.macro KERNEL8x1_SUB1 + + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi T1, T1, 16 + + lxsspx vs12, o0, T1 + lxsspx vs13, o4, T1 + lxsspx vs14, o8, T1 + lxsspx vs15, o12, T1 + + addi BO, BO, 32 + + + xsmaddadp vs32, vs0, vs8 + + xsmaddadp vs33, vs0, vs9 + + xsmaddadp vs34, vs0, vs10 + + xsmaddadp vs35, vs0, vs11 + + xsmaddadp vs36, vs0, vs12 + + xsmaddadp vs37, vs0, vs13 + + xsmaddadp vs38, vs0, vs14 + + xsmaddadp vs39, vs0, vs15 + + +.endm + +.macro SAVE8x1 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs32, alpha_r +#else + xsmaddadp vs0, vs32, alpha_r +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs33, alpha_r +#else + xsmaddadp vs0, vs33, alpha_r +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs34, alpha_r +#else + xsmaddadp vs0, vs34, alpha_r +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs35, alpha_r +#else + xsmaddadp vs0, vs35, alpha_r +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs36, alpha_r +#else + xsmaddadp vs0, vs36, alpha_r +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs37, alpha_r +#else + xsmaddadp vs0, vs37, alpha_r +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs38, 
alpha_r +#else + xsmaddadp vs0, vs38, alpha_r +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs39, alpha_r +#else + xsmaddadp vs0, vs39, alpha_r +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + addi CO, CO, 4 + +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=16 +**********************************************************************************************/ + +.macro LOAD4x16_1 + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + addi BO, BO, 16 + +.endm + +.macro KERNEL4x16_I1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + lxvw4x vs6, o32, AO + lxvw4x vs7, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + xxspltw vs18, vs28, 2 + xxspltw vs19, vs28, 3 + + addi BO, BO, 16 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + xvmulsp vs34, vs2, vs8 + xvmulsp vs35, vs3, vs8 + + xvmulsp vs36, vs0, vs9 + xvmulsp vs37, vs1, vs9 + xvmulsp vs38, vs2, vs9 + xvmulsp vs39, vs3, vs9 + + xvmulsp vs40, vs0, vs10 + xvmulsp vs41, vs1, vs10 + xvmulsp vs42, vs2, vs10 + xvmulsp vs43, vs3, vs10 + + xvmulsp vs44, vs0, vs11 + xvmulsp vs45, vs1, vs11 + xvmulsp vs46, vs2, vs11 + xvmulsp vs47, vs3, vs11 + + +.endm + +.macro KERNEL4x16_1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + lxvw4x vs6, o32, AO + lxvw4x vs7, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + xxspltw vs18, vs28, 2 + xxspltw vs19, vs28, 3 + + addi BO, BO, 16 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + xvmaddasp vs34, vs2, vs8 + xvmaddasp vs35, vs3, vs8 + + xvmaddasp vs36, vs0, vs9 + xvmaddasp vs37, vs1, vs9 + xvmaddasp vs38, vs2, 
vs9 + xvmaddasp vs39, vs3, vs9 + + xvmaddasp vs40, vs0, vs10 + xvmaddasp vs41, vs1, vs10 + xvmaddasp vs42, vs2, vs10 + xvmaddasp vs43, vs3, vs10 + + xvmaddasp vs44, vs0, vs11 + xvmaddasp vs45, vs1, vs11 + xvmaddasp vs46, vs2, vs11 + xvmaddasp vs47, vs3, vs11 + + +.endm + +.macro KERNEL4x16_2 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + addi BO, BO, 16 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + xvmaddasp vs34, vs6, vs16 + xvmaddasp vs35, vs7, vs16 + + xvmaddasp vs36, vs4, vs17 + xvmaddasp vs37, vs5, vs17 + xvmaddasp vs38, vs6, vs17 + xvmaddasp vs39, vs7, vs17 + + xvmaddasp vs40, vs4, vs18 + xvmaddasp vs41, vs5, vs18 + xvmaddasp vs42, vs6, vs18 + xvmaddasp vs43, vs7, vs18 + + xvmaddasp vs44, vs4, vs19 + xvmaddasp vs45, vs5, vs19 + xvmaddasp vs46, vs6, vs19 + xvmaddasp vs47, vs7, vs19 + + +.endm + +.macro KERNEL4x16_E2 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + xvmaddasp vs34, vs6, vs16 + xvmaddasp vs35, vs7, vs16 + + xvmaddasp vs36, vs4, vs17 + xvmaddasp vs37, vs5, vs17 + xvmaddasp vs38, vs6, vs17 + xvmaddasp vs39, vs7, vs17 + + xvmaddasp vs40, vs4, vs18 + xvmaddasp vs41, vs5, vs18 + xvmaddasp vs42, vs6, vs18 + xvmaddasp vs43, vs7, vs18 + + xvmaddasp vs44, vs4, vs19 + xvmaddasp vs45, vs5, vs19 + xvmaddasp vs46, vs6, vs19 + xvmaddasp vs47, vs7, vs19 + + +.endm + +.macro KERNEL4x16_SUBI1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + addi BO, BO, 16 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + xvmulsp vs34, vs2, vs8 + xvmulsp vs35, vs3, vs8 + + xvmulsp vs36, vs0, vs9 + xvmulsp vs37, vs1, vs9 + xvmulsp vs38, vs2, vs9 + xvmulsp vs39, vs3, vs9 + + 
xvmulsp vs40, vs0, vs10 + xvmulsp vs41, vs1, vs10 + xvmulsp vs42, vs2, vs10 + xvmulsp vs43, vs3, vs10 + + xvmulsp vs44, vs0, vs11 + xvmulsp vs45, vs1, vs11 + xvmulsp vs46, vs2, vs11 + xvmulsp vs47, vs3, vs11 + + +.endm + +.macro KERNEL4x16_SUB1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + addi BO, BO, 16 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + xvmaddasp vs34, vs2, vs8 + xvmaddasp vs35, vs3, vs8 + + xvmaddasp vs36, vs0, vs9 + xvmaddasp vs37, vs1, vs9 + xvmaddasp vs38, vs2, vs9 + xvmaddasp vs39, vs3, vs9 + + xvmaddasp vs40, vs0, vs10 + xvmaddasp vs41, vs1, vs10 + xvmaddasp vs42, vs2, vs10 + xvmaddasp vs43, vs3, vs10 + + xvmaddasp vs44, vs0, vs11 + xvmaddasp vs45, vs1, vs11 + xvmaddasp vs46, vs2, vs11 + xvmaddasp vs47, vs3, vs11 + + +.endm + +.macro SAVE4x16 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs32, alpha_vr + xvmulsp vs1, vs33, alpha_vr + xvmulsp vs2, vs34, alpha_vr + xvmulsp vs3, vs35, alpha_vr +#else + xvmaddasp vs0, vs32, alpha_vr + xvmaddasp vs1, vs33, alpha_vr + xvmaddasp vs2, vs34, alpha_vr + xvmaddasp vs3, vs35, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs36, alpha_vr + xvmulsp vs1, vs37, alpha_vr + xvmulsp vs2, vs38, alpha_vr + xvmulsp vs3, vs39, alpha_vr +#else + xvmaddasp vs0, vs36, alpha_vr + xvmaddasp vs1, vs37, alpha_vr + xvmaddasp vs2, vs38, alpha_vr + xvmaddasp vs3, vs39, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, 
o48, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs40, alpha_vr + xvmulsp vs1, vs41, alpha_vr + xvmulsp vs2, vs42, alpha_vr + xvmulsp vs3, vs43, alpha_vr +#else + xvmaddasp vs0, vs40, alpha_vr + xvmaddasp vs1, vs41, alpha_vr + xvmaddasp vs2, vs42, alpha_vr + xvmaddasp vs3, vs43, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs44, alpha_vr + xvmulsp vs1, vs45, alpha_vr + xvmulsp vs2, vs46, alpha_vr + xvmulsp vs3, vs47, alpha_vr +#else + xvmaddasp vs0, vs44, alpha_vr + xvmaddasp vs1, vs45, alpha_vr + xvmaddasp vs2, vs46, alpha_vr + xvmaddasp vs3, vs47, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + addi CO, CO, 64 + +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=8 +**********************************************************************************************/ + +.macro LOAD4x8_1 + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + addi BO, BO, 16 + +.endm + +.macro KERNEL4x8_I1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + xxspltw vs18, vs28, 2 + xxspltw vs19, vs28, 3 + + addi BO, BO, 16 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + + xvmulsp vs34, vs0, vs9 + xvmulsp vs35, vs1, vs9 + + xvmulsp vs36, vs0, vs10 + xvmulsp vs37, vs1, vs10 + + xvmulsp vs38, vs0, vs11 + xvmulsp vs39, vs1, 
vs11 + + +.endm + +.macro KERNEL4x8_1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + xxspltw vs18, vs28, 2 + xxspltw vs19, vs28, 3 + + addi BO, BO, 16 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + + xvmaddasp vs34, vs0, vs9 + xvmaddasp vs35, vs1, vs9 + + xvmaddasp vs36, vs0, vs10 + xvmaddasp vs37, vs1, vs10 + + xvmaddasp vs38, vs0, vs11 + xvmaddasp vs39, vs1, vs11 + + +.endm + +.macro KERNEL4x8_2 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + addi BO, BO, 16 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + + xvmaddasp vs34, vs4, vs17 + xvmaddasp vs35, vs5, vs17 + + xvmaddasp vs36, vs4, vs18 + xvmaddasp vs37, vs5, vs18 + + xvmaddasp vs38, vs4, vs19 + xvmaddasp vs39, vs5, vs19 + + +.endm + +.macro KERNEL4x8_E2 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + + xvmaddasp vs34, vs4, vs17 + xvmaddasp vs35, vs5, vs17 + + xvmaddasp vs36, vs4, vs18 + xvmaddasp vs37, vs5, vs18 + + xvmaddasp vs38, vs4, vs19 + xvmaddasp vs39, vs5, vs19 + + +.endm + +.macro KERNEL4x8_SUBI1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + addi BO, BO, 16 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + + xvmulsp vs34, vs0, vs9 + xvmulsp vs35, vs1, vs9 + + xvmulsp vs36, vs0, vs10 + xvmulsp vs37, vs1, vs10 + + xvmulsp vs38, vs0, vs11 + xvmulsp vs39, vs1, vs11 + + +.endm + +.macro KERNEL4x8_SUB1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + addi BO, BO, 16 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + + xvmaddasp vs34, 
vs0, vs9 + xvmaddasp vs35, vs1, vs9 + + xvmaddasp vs36, vs0, vs10 + xvmaddasp vs37, vs1, vs10 + + xvmaddasp vs38, vs0, vs11 + xvmaddasp vs39, vs1, vs11 + + +.endm + +.macro SAVE4x8 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs32, alpha_vr + xvmulsp vs1, vs33, alpha_vr +#else + xvmaddasp vs0, vs32, alpha_vr + xvmaddasp vs1, vs33, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs34, alpha_vr + xvmulsp vs1, vs35, alpha_vr +#else + xvmaddasp vs0, vs34, alpha_vr + xvmaddasp vs1, vs35, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs36, alpha_vr + xvmulsp vs1, vs37, alpha_vr +#else + xvmaddasp vs0, vs36, alpha_vr + xvmaddasp vs1, vs37, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs38, alpha_vr + xvmulsp vs1, vs39, alpha_vr +#else + xvmaddasp vs0, vs38, alpha_vr + xvmaddasp vs1, vs39, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + addi CO, CO, 32 + +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=4 +**********************************************************************************************/ + +.macro LOAD4x4_1 + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + addi BO, BO, 16 + +.endm + +.macro KERNEL4x4_I1 + + + lxvw4x vs4, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 
+ xxspltw vs17, vs28, 1 + xxspltw vs18, vs28, 2 + xxspltw vs19, vs28, 3 + + addi BO, BO, 16 + + + xvmulsp vs32, vs0, vs8 + + xvmulsp vs33, vs0, vs9 + + xvmulsp vs34, vs0, vs10 + + xvmulsp vs35, vs0, vs11 + + +.endm + +.macro KERNEL4x4_1 + + + lxvw4x vs4, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + xxspltw vs18, vs28, 2 + xxspltw vs19, vs28, 3 + + addi BO, BO, 16 + + + xvmaddasp vs32, vs0, vs8 + + xvmaddasp vs33, vs0, vs9 + + xvmaddasp vs34, vs0, vs10 + + xvmaddasp vs35, vs0, vs11 + + +.endm + +.macro KERNEL4x4_2 + + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + addi BO, BO, 16 + + + xvmaddasp vs32, vs4, vs16 + + xvmaddasp vs33, vs4, vs17 + + xvmaddasp vs34, vs4, vs18 + + xvmaddasp vs35, vs4, vs19 + + +.endm + +.macro KERNEL4x4_E2 + + + xvmaddasp vs32, vs4, vs16 + + xvmaddasp vs33, vs4, vs17 + + xvmaddasp vs34, vs4, vs18 + + xvmaddasp vs35, vs4, vs19 + + +.endm + +.macro KERNEL4x4_SUBI1 + + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + addi BO, BO, 16 + + + xvmulsp vs32, vs0, vs8 + + xvmulsp vs33, vs0, vs9 + + xvmulsp vs34, vs0, vs10 + + xvmulsp vs35, vs0, vs11 + + +.endm + +.macro KERNEL4x4_SUB1 + + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + xxspltw vs10, vs28, 2 + xxspltw vs11, vs28, 3 + + addi BO, BO, 16 + + + xvmaddasp vs32, vs0, vs8 + + xvmaddasp vs33, vs0, vs9 + + xvmaddasp vs34, vs0, vs10 + + xvmaddasp vs35, vs0, vs11 + + +.endm + +.macro SAVE4x4 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs32, alpha_vr +#else + xvmaddasp vs0, vs32, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, 
T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs33, alpha_vr +#else + xvmaddasp vs0, vs33, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs34, alpha_vr +#else + xvmaddasp vs0, vs34, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs35, alpha_vr +#else + xvmaddasp vs0, vs35, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + addi CO, CO, 16 + +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=2 +**********************************************************************************************/ + +.macro LOAD4x2_1 + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi BO, BO, 16 + +.endm + +.macro KERNEL4x2_I1 + + + lxsspx vs4, o0, AO + lxsspx vs5, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs16, o0, T1 + lxsspx vs17, o4, T1 + lxsspx vs18, o8, T1 + lxsspx vs19, o12, T1 + + addi BO, BO, 16 + + + xsmuldp vs32, vs0, vs8 + xsmuldp vs33, vs1, vs8 + + xsmuldp vs34, vs0, vs9 + xsmuldp vs35, vs1, vs9 + + xsmuldp vs36, vs0, vs10 + xsmuldp vs37, vs1, vs10 + + xsmuldp vs38, vs0, vs11 + xsmuldp vs39, vs1, vs11 + + +.endm + +.macro KERNEL4x2_1 + + + lxsspx vs4, o0, AO + lxsspx vs5, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs16, o0, T1 + lxsspx vs17, o4, T1 + lxsspx vs18, o8, T1 + lxsspx vs19, o12, T1 + + addi BO, BO, 16 + + + xsmaddadp vs32, vs0, vs8 + xsmaddadp vs33, vs1, vs8 + + xsmaddadp vs34, vs0, vs9 + xsmaddadp vs35, vs1, vs9 + + xsmaddadp vs36, vs0, vs10 + xsmaddadp vs37, vs1, vs10 + + xsmaddadp vs38, vs0, vs11 + xsmaddadp vs39, vs1, vs11 + + +.endm + +.macro KERNEL4x2_2 + + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 
+ + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi BO, BO, 16 + + + xsmaddadp vs32, vs4, vs16 + xsmaddadp vs33, vs5, vs16 + + xsmaddadp vs34, vs4, vs17 + xsmaddadp vs35, vs5, vs17 + + xsmaddadp vs36, vs4, vs18 + xsmaddadp vs37, vs5, vs18 + + xsmaddadp vs38, vs4, vs19 + xsmaddadp vs39, vs5, vs19 + + +.endm + +.macro KERNEL4x2_E2 + + + xsmaddadp vs32, vs4, vs16 + xsmaddadp vs33, vs5, vs16 + + xsmaddadp vs34, vs4, vs17 + xsmaddadp vs35, vs5, vs17 + + xsmaddadp vs36, vs4, vs18 + xsmaddadp vs37, vs5, vs18 + + xsmaddadp vs38, vs4, vs19 + xsmaddadp vs39, vs5, vs19 + + +.endm + +.macro KERNEL4x2_SUBI1 + + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi BO, BO, 16 + + + xsmuldp vs32, vs0, vs8 + xsmuldp vs33, vs1, vs8 + + xsmuldp vs34, vs0, vs9 + xsmuldp vs35, vs1, vs9 + + xsmuldp vs36, vs0, vs10 + xsmuldp vs37, vs1, vs10 + + xsmuldp vs38, vs0, vs11 + xsmuldp vs39, vs1, vs11 + + +.endm + +.macro KERNEL4x2_SUB1 + + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi BO, BO, 16 + + + xsmaddadp vs32, vs0, vs8 + xsmaddadp vs33, vs1, vs8 + + xsmaddadp vs34, vs0, vs9 + xsmaddadp vs35, vs1, vs9 + + xsmaddadp vs36, vs0, vs10 + xsmaddadp vs37, vs1, vs10 + + xsmaddadp vs38, vs0, vs11 + xsmaddadp vs39, vs1, vs11 + + +.endm + +.macro SAVE4x2 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs32, alpha_r + xsmuldp vs1, vs33, alpha_r +#else + xsmaddadp vs0, vs32, alpha_r + xsmaddadp vs1, vs33, alpha_r +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs34, alpha_r + xsmuldp 
vs1, vs35, alpha_r +#else + xsmaddadp vs0, vs34, alpha_r + xsmaddadp vs1, vs35, alpha_r +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs36, alpha_r + xsmuldp vs1, vs37, alpha_r +#else + xsmaddadp vs0, vs36, alpha_r + xsmaddadp vs1, vs37, alpha_r +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs38, alpha_r + xsmuldp vs1, vs39, alpha_r +#else + xsmaddadp vs0, vs38, alpha_r + xsmaddadp vs1, vs39, alpha_r +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + addi CO, CO, 8 + +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=1 +**********************************************************************************************/ + +.macro LOAD4x1_1 + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi BO, BO, 16 + +.endm + +.macro KERNEL4x1_I1 + + + lxsspx vs4, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs16, o0, T1 + lxsspx vs17, o4, T1 + lxsspx vs18, o8, T1 + lxsspx vs19, o12, T1 + + addi BO, BO, 16 + + + xsmuldp vs32, vs0, vs8 + + xsmuldp vs33, vs0, vs9 + + xsmuldp vs34, vs0, vs10 + + xsmuldp vs35, vs0, vs11 + + +.endm + +.macro KERNEL4x1_1 + + + lxsspx vs4, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs16, o0, T1 + lxsspx vs17, o4, T1 + lxsspx vs18, o8, T1 + lxsspx vs19, o12, T1 + + addi BO, BO, 16 + + + xsmaddadp vs32, vs0, vs8 + + xsmaddadp vs33, vs0, vs9 + + xsmaddadp vs34, vs0, vs10 + + xsmaddadp vs35, vs0, vs11 + + +.endm + +.macro KERNEL4x1_2 + + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx 
vs11, o12, T1 + + addi BO, BO, 16 + + + xsmaddadp vs32, vs4, vs16 + + xsmaddadp vs33, vs4, vs17 + + xsmaddadp vs34, vs4, vs18 + + xsmaddadp vs35, vs4, vs19 + + +.endm + +.macro KERNEL4x1_E2 + + + xsmaddadp vs32, vs4, vs16 + + xsmaddadp vs33, vs4, vs17 + + xsmaddadp vs34, vs4, vs18 + + xsmaddadp vs35, vs4, vs19 + + +.endm + +.macro KERNEL4x1_SUBI1 + + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi BO, BO, 16 + + + xsmuldp vs32, vs0, vs8 + + xsmuldp vs33, vs0, vs9 + + xsmuldp vs34, vs0, vs10 + + xsmuldp vs35, vs0, vs11 + + +.endm + +.macro KERNEL4x1_SUB1 + + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + lxsspx vs10, o8, T1 + lxsspx vs11, o12, T1 + + addi BO, BO, 16 + + + xsmaddadp vs32, vs0, vs8 + + xsmaddadp vs33, vs0, vs9 + + xsmaddadp vs34, vs0, vs10 + + xsmaddadp vs35, vs0, vs11 + + +.endm + +.macro SAVE4x1 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs32, alpha_r +#else + xsmaddadp vs0, vs32, alpha_r +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs33, alpha_r +#else + xsmaddadp vs0, vs33, alpha_r +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs34, alpha_r +#else + xsmaddadp vs0, vs34, alpha_r +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs35, alpha_r +#else + xsmaddadp vs0, vs35, alpha_r +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + addi CO, CO, 4 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=16 
+**********************************************************************************************/ + +.macro LOAD2x16_1 + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + + addi BO, BO, 8 + +.endm + +.macro KERNEL2x16_I1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + lxvw4x vs6, o32, AO + lxvw4x vs7, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + + addi BO, BO, 8 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + xvmulsp vs34, vs2, vs8 + xvmulsp vs35, vs3, vs8 + + xvmulsp vs36, vs0, vs9 + xvmulsp vs37, vs1, vs9 + xvmulsp vs38, vs2, vs9 + xvmulsp vs39, vs3, vs9 + + +.endm + +.macro KERNEL2x16_1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + lxvw4x vs6, o32, AO + lxvw4x vs7, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + + addi BO, BO, 8 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + xvmaddasp vs34, vs2, vs8 + xvmaddasp vs35, vs3, vs8 + + xvmaddasp vs36, vs0, vs9 + xvmaddasp vs37, vs1, vs9 + xvmaddasp vs38, vs2, vs9 + xvmaddasp vs39, vs3, vs9 + + +.endm + +.macro KERNEL2x16_2 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + + addi BO, BO, 8 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + xvmaddasp vs34, vs6, vs16 + xvmaddasp vs35, vs7, vs16 + + xvmaddasp vs36, vs4, vs17 + xvmaddasp vs37, vs5, vs17 + xvmaddasp vs38, vs6, vs17 + xvmaddasp vs39, vs7, vs17 + + +.endm + +.macro KERNEL2x16_E2 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + xvmaddasp vs34, vs6, vs16 + xvmaddasp vs35, vs7, vs16 + + xvmaddasp vs36, vs4, vs17 + xvmaddasp vs37, vs5, vs17 + xvmaddasp vs38, vs6, vs17 + xvmaddasp vs39, vs7, vs17 + + +.endm + +.macro KERNEL2x16_SUBI1 + + + 
lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + + addi BO, BO, 8 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + xvmulsp vs34, vs2, vs8 + xvmulsp vs35, vs3, vs8 + + xvmulsp vs36, vs0, vs9 + xvmulsp vs37, vs1, vs9 + xvmulsp vs38, vs2, vs9 + xvmulsp vs39, vs3, vs9 + + +.endm + +.macro KERNEL2x16_SUB1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + + addi BO, BO, 8 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + xvmaddasp vs34, vs2, vs8 + xvmaddasp vs35, vs3, vs8 + + xvmaddasp vs36, vs0, vs9 + xvmaddasp vs37, vs1, vs9 + xvmaddasp vs38, vs2, vs9 + xvmaddasp vs39, vs3, vs9 + + +.endm + +.macro SAVE2x16 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs32, alpha_vr + xvmulsp vs1, vs33, alpha_vr + xvmulsp vs2, vs34, alpha_vr + xvmulsp vs3, vs35, alpha_vr +#else + xvmaddasp vs0, vs32, alpha_vr + xvmaddasp vs1, vs33, alpha_vr + xvmaddasp vs2, vs34, alpha_vr + xvmaddasp vs3, vs35, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs36, alpha_vr + xvmulsp vs1, vs37, alpha_vr + xvmulsp vs2, vs38, alpha_vr + xvmulsp vs3, vs39, alpha_vr +#else + xvmaddasp vs0, vs36, alpha_vr + xvmaddasp vs1, vs37, alpha_vr + xvmaddasp vs2, vs38, alpha_vr + xvmaddasp vs3, vs39, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + addi CO, CO, 64 + +.endm + + 
+/********************************************************************************************** +* Macros for N=2 and M=8 +**********************************************************************************************/ + +.macro LOAD2x8_1 + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + + addi BO, BO, 8 + +.endm + +.macro KERNEL2x8_I1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + + addi BO, BO, 8 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + + xvmulsp vs34, vs0, vs9 + xvmulsp vs35, vs1, vs9 + + +.endm + +.macro KERNEL2x8_1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + + addi BO, BO, 8 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + + xvmaddasp vs34, vs0, vs9 + xvmaddasp vs35, vs1, vs9 + + +.endm + +.macro KERNEL2x8_2 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + + addi BO, BO, 8 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + + xvmaddasp vs34, vs4, vs17 + xvmaddasp vs35, vs5, vs17 + + +.endm + +.macro KERNEL2x8_E2 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + + xvmaddasp vs34, vs4, vs17 + xvmaddasp vs35, vs5, vs17 + + +.endm + +.macro KERNEL2x8_SUBI1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + + addi BO, BO, 8 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + + xvmulsp vs34, vs0, vs9 + xvmulsp vs35, vs1, vs9 + + +.endm + +.macro KERNEL2x8_SUB1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + + addi BO, BO, 8 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp 
vs33, vs1, vs8 + + xvmaddasp vs34, vs0, vs9 + xvmaddasp vs35, vs1, vs9 + + +.endm + +.macro SAVE2x8 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs32, alpha_vr + xvmulsp vs1, vs33, alpha_vr +#else + xvmaddasp vs0, vs32, alpha_vr + xvmaddasp vs1, vs33, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs34, alpha_vr + xvmulsp vs1, vs35, alpha_vr +#else + xvmaddasp vs0, vs34, alpha_vr + xvmaddasp vs1, vs35, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + addi CO, CO, 32 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=4 +**********************************************************************************************/ + +.macro LOAD2x4_1 + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + + addi BO, BO, 8 + +.endm + +.macro KERNEL2x4_I1 + + + lxvw4x vs4, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + + addi BO, BO, 8 + + + xvmulsp vs32, vs0, vs8 + + xvmulsp vs33, vs0, vs9 + + +.endm + +.macro KERNEL2x4_1 + + + lxvw4x vs4, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + xxspltw vs17, vs28, 1 + + addi BO, BO, 8 + + + xvmaddasp vs32, vs0, vs8 + + xvmaddasp vs33, vs0, vs9 + + +.endm + +.macro KERNEL2x4_2 + + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + + addi BO, BO, 8 + + + xvmaddasp vs32, vs4, vs16 + + xvmaddasp vs33, vs4, vs17 + + +.endm + +.macro KERNEL2x4_E2 + + + xvmaddasp vs32, vs4, vs16 + + xvmaddasp vs33, vs4, vs17 + + +.endm + +.macro KERNEL2x4_SUBI1 + + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + 
+ lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + + addi BO, BO, 8 + + + xvmulsp vs32, vs0, vs8 + + xvmulsp vs33, vs0, vs9 + + +.endm + +.macro KERNEL2x4_SUB1 + + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + xxspltw vs9, vs28, 1 + + addi BO, BO, 8 + + + xvmaddasp vs32, vs0, vs8 + + xvmaddasp vs33, vs0, vs9 + + +.endm + +.macro SAVE2x4 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs32, alpha_vr +#else + xvmaddasp vs0, vs32, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs33, alpha_vr +#else + xvmaddasp vs0, vs33, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + addi CO, CO, 16 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=2 +**********************************************************************************************/ + +.macro LOAD2x2_1 + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + + addi BO, BO, 8 + +.endm + +.macro KERNEL2x2_I1 + + + lxsspx vs4, o0, AO + lxsspx vs5, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs16, o0, T1 + lxsspx vs17, o4, T1 + + addi BO, BO, 8 + + + xsmuldp vs32, vs0, vs8 + xsmuldp vs33, vs1, vs8 + + xsmuldp vs34, vs0, vs9 + xsmuldp vs35, vs1, vs9 + + +.endm + +.macro KERNEL2x2_1 + + + lxsspx vs4, o0, AO + lxsspx vs5, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs16, o0, T1 + lxsspx vs17, o4, T1 + + addi BO, BO, 8 + + + xsmaddadp vs32, vs0, vs8 + xsmaddadp vs33, vs1, vs8 + + xsmaddadp vs34, vs0, vs9 + xsmaddadp vs35, vs1, vs9 + + +.endm + +.macro KERNEL2x2_2 + + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + + addi BO, BO, 8 + + + xsmaddadp vs32, 
vs4, vs16 + xsmaddadp vs33, vs5, vs16 + + xsmaddadp vs34, vs4, vs17 + xsmaddadp vs35, vs5, vs17 + + +.endm + +.macro KERNEL2x2_E2 + + + xsmaddadp vs32, vs4, vs16 + xsmaddadp vs33, vs5, vs16 + + xsmaddadp vs34, vs4, vs17 + xsmaddadp vs35, vs5, vs17 + + +.endm + +.macro KERNEL2x2_SUBI1 + + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + + addi BO, BO, 8 + + + xsmuldp vs32, vs0, vs8 + xsmuldp vs33, vs1, vs8 + + xsmuldp vs34, vs0, vs9 + xsmuldp vs35, vs1, vs9 + + +.endm + +.macro KERNEL2x2_SUB1 + + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + + addi BO, BO, 8 + + + xsmaddadp vs32, vs0, vs8 + xsmaddadp vs33, vs1, vs8 + + xsmaddadp vs34, vs0, vs9 + xsmaddadp vs35, vs1, vs9 + + +.endm + +.macro SAVE2x2 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs32, alpha_r + xsmuldp vs1, vs33, alpha_r +#else + xsmaddadp vs0, vs32, alpha_r + xsmaddadp vs1, vs33, alpha_r +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs34, alpha_r + xsmuldp vs1, vs35, alpha_r +#else + xsmaddadp vs0, vs34, alpha_r + xsmaddadp vs1, vs35, alpha_r +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + addi CO, CO, 8 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=1 +**********************************************************************************************/ + +.macro LOAD2x1_1 + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + + addi BO, BO, 8 + +.endm + +.macro KERNEL2x1_I1 + + + lxsspx vs4, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs16, o0, T1 + lxsspx vs17, o4, T1 + + 
addi BO, BO, 8 + + + xsmuldp vs32, vs0, vs8 + + xsmuldp vs33, vs0, vs9 + + +.endm + +.macro KERNEL2x1_1 + + + lxsspx vs4, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs16, o0, T1 + lxsspx vs17, o4, T1 + + addi BO, BO, 8 + + + xsmaddadp vs32, vs0, vs8 + + xsmaddadp vs33, vs0, vs9 + + +.endm + +.macro KERNEL2x1_2 + + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + + addi BO, BO, 8 + + + xsmaddadp vs32, vs4, vs16 + + xsmaddadp vs33, vs4, vs17 + + +.endm + +.macro KERNEL2x1_E2 + + + xsmaddadp vs32, vs4, vs16 + + xsmaddadp vs33, vs4, vs17 + + +.endm + +.macro KERNEL2x1_SUBI1 + + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + + addi BO, BO, 8 + + + xsmuldp vs32, vs0, vs8 + + xsmuldp vs33, vs0, vs9 + + +.endm + +.macro KERNEL2x1_SUB1 + + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + lxsspx vs9, o4, T1 + + addi BO, BO, 8 + + + xsmaddadp vs32, vs0, vs8 + + xsmaddadp vs33, vs0, vs9 + + +.endm + +.macro SAVE2x1 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs32, alpha_r +#else + xsmaddadp vs0, vs32, alpha_r +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs33, alpha_r +#else + xsmaddadp vs0, vs33, alpha_r +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + addi CO, CO, 4 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=16 +**********************************************************************************************/ + +.macro LOAD1x16_1 + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + + addi BO, BO, 4 + +.endm + +.macro KERNEL1x16_I1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + 
lxvw4x vs6, o32, AO + lxvw4x vs7, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + + addi BO, BO, 4 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + xvmulsp vs34, vs2, vs8 + xvmulsp vs35, vs3, vs8 + + +.endm + +.macro KERNEL1x16_1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + lxvw4x vs6, o32, AO + lxvw4x vs7, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + + addi BO, BO, 4 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + xvmaddasp vs34, vs2, vs8 + xvmaddasp vs35, vs3, vs8 + + +.endm + +.macro KERNEL1x16_2 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + + addi BO, BO, 4 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + xvmaddasp vs34, vs6, vs16 + xvmaddasp vs35, vs7, vs16 + + +.endm + +.macro KERNEL1x16_E2 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + xvmaddasp vs34, vs6, vs16 + xvmaddasp vs35, vs7, vs16 + + +.endm + +.macro KERNEL1x16_SUBI1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + + addi BO, BO, 4 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + xvmulsp vs34, vs2, vs8 + xvmulsp vs35, vs3, vs8 + + +.endm + +.macro KERNEL1x16_SUB1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + lxvw4x vs2, o32, AO + lxvw4x vs3, o48, AO + + addi AO, AO, 64 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + + addi BO, BO, 4 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + xvmaddasp vs34, vs2, vs8 + xvmaddasp vs35, vs3, vs8 + + +.endm + +.macro SAVE1x16 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + lxvw4x vs2, o32, T1 + lxvw4x vs3, o48, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs32, alpha_vr + xvmulsp vs1, vs33, alpha_vr + xvmulsp vs2, vs34, alpha_vr + xvmulsp vs3, vs35, alpha_vr 
+#else + xvmaddasp vs0, vs32, alpha_vr + xvmaddasp vs1, vs33, alpha_vr + xvmaddasp vs2, vs34, alpha_vr + xvmaddasp vs3, vs35, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + stxvw4x vs2, o32, T1 + stxvw4x vs3, o48, T1 + + add T1, T1, LDC + + addi CO, CO, 64 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=8 +**********************************************************************************************/ + +.macro LOAD1x8_1 + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + + addi BO, BO, 4 + +.endm + +.macro KERNEL1x8_I1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + + addi BO, BO, 4 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + + +.endm + +.macro KERNEL1x8_1 + + + lxvw4x vs4, o0, AO + lxvw4x vs5, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + + addi BO, BO, 4 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + + +.endm + +.macro KERNEL1x8_2 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + + addi BO, BO, 4 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + + +.endm + +.macro KERNEL1x8_E2 + + + xvmaddasp vs32, vs4, vs16 + xvmaddasp vs33, vs5, vs16 + + +.endm + +.macro KERNEL1x8_SUBI1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + + addi BO, BO, 4 + + + xvmulsp vs32, vs0, vs8 + xvmulsp vs33, vs1, vs8 + + +.endm + +.macro KERNEL1x8_SUB1 + + + lxvw4x vs0, o0, AO + lxvw4x vs1, o16, AO + + addi AO, AO, 32 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + + addi BO, BO, 4 + + + xvmaddasp vs32, vs0, vs8 + xvmaddasp vs33, vs1, vs8 + + +.endm + +.macro SAVE1x8 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + lxvw4x vs1, o16, T1 + 
+#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs32, alpha_vr + xvmulsp vs1, vs33, alpha_vr +#else + xvmaddasp vs0, vs32, alpha_vr + xvmaddasp vs1, vs33, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + stxvw4x vs1, o16, T1 + + add T1, T1, LDC + + addi CO, CO, 32 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=4 +**********************************************************************************************/ + +.macro LOAD1x4_1 + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + + addi BO, BO, 4 + +.endm + +.macro KERNEL1x4_I1 + + + lxvw4x vs4, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + + addi BO, BO, 4 + + + xvmulsp vs32, vs0, vs8 + + +.endm + +.macro KERNEL1x4_1 + + + lxvw4x vs4, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs16, vs28, 0 + + addi BO, BO, 4 + + + xvmaddasp vs32, vs0, vs8 + + +.endm + +.macro KERNEL1x4_2 + + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + + addi BO, BO, 4 + + + xvmaddasp vs32, vs4, vs16 + + +.endm + +.macro KERNEL1x4_E2 + + + xvmaddasp vs32, vs4, vs16 + + +.endm + +.macro KERNEL1x4_SUBI1 + + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + + addi BO, BO, 4 + + + xvmulsp vs32, vs0, vs8 + + +.endm + +.macro KERNEL1x4_SUB1 + + + lxvw4x vs0, o0, AO + + addi AO, AO, 16 + + lxvw4x vs28, o0, BO + + xxspltw vs8, vs28, 0 + + addi BO, BO, 4 + + + xvmaddasp vs32, vs0, vs8 + + +.endm + +.macro SAVE1x4 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvw4x vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xvmulsp vs0, vs32, alpha_vr +#else + xvmaddasp vs0, vs32, alpha_vr +#endif + + stxvw4x vs0, o0, T1 + + add T1, T1, LDC + + addi CO, CO, 16 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=2 
+**********************************************************************************************/ + +.macro LOAD1x2_1 + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + + addi BO, BO, 4 + +.endm + +.macro KERNEL1x2_I1 + + + lxsspx vs4, o0, AO + lxsspx vs5, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs16, o0, T1 + + addi BO, BO, 4 + + + xsmuldp vs32, vs0, vs8 + xsmuldp vs33, vs1, vs8 + + +.endm + +.macro KERNEL1x2_1 + + + lxsspx vs4, o0, AO + lxsspx vs5, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs16, o0, T1 + + addi BO, BO, 4 + + + xsmaddadp vs32, vs0, vs8 + xsmaddadp vs33, vs1, vs8 + + +.endm + +.macro KERNEL1x2_2 + + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + + addi BO, BO, 4 + + + xsmaddadp vs32, vs4, vs16 + xsmaddadp vs33, vs5, vs16 + + +.endm + +.macro KERNEL1x2_E2 + + + xsmaddadp vs32, vs4, vs16 + xsmaddadp vs33, vs5, vs16 + + +.endm + +.macro KERNEL1x2_SUBI1 + + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + + addi BO, BO, 4 + + + xsmuldp vs32, vs0, vs8 + xsmuldp vs33, vs1, vs8 + + +.endm + +.macro KERNEL1x2_SUB1 + + + lxsspx vs0, o0, AO + lxsspx vs1, o4, AO + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 + + addi BO, BO, 4 + + + xsmaddadp vs32, vs0, vs8 + xsmaddadp vs33, vs1, vs8 + + +.endm + +.macro SAVE1x2 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + lxsspx vs1, o4, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs32, alpha_r + xsmuldp vs1, vs33, alpha_r +#else + xsmaddadp vs0, vs32, alpha_r + xsmaddadp vs1, vs33, alpha_r +#endif + + stxsspx vs0, o0, T1 + stxsspx vs1, o4, T1 + + add T1, T1, LDC + + addi CO, CO, 8 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=1 +**********************************************************************************************/ + +.macro 
LOAD1x1_1 + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + + addi BO, BO, 4 + +.endm + +.macro KERNEL1x1_I1 + + + lxsspx vs4, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs16, o0, T1 + + addi BO, BO, 4 + + + xsmuldp vs32, vs0, vs8 + + +.endm + +.macro KERNEL1x1_1 + + + lxsspx vs4, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs16, o0, T1 + + addi BO, BO, 4 + + + xsmaddadp vs32, vs0, vs8 + + +.endm + +.macro KERNEL1x1_2 + + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + + addi BO, BO, 4 + + + xsmaddadp vs32, vs4, vs16 + + +.endm + +.macro KERNEL1x1_E2 + + + xsmaddadp vs32, vs4, vs16 + + +.endm + +.macro KERNEL1x1_SUBI1 + + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + + addi BO, BO, 4 + + + xsmuldp vs32, vs0, vs8 + + +.endm + +.macro KERNEL1x1_SUB1 + + + lxsspx vs0, o0, AO + + addi AO, AO, 4 + + mr T1, BO + + lxsspx vs8, o0, T1 + + addi BO, BO, 4 + + + xsmaddadp vs32, vs0, vs8 + + +.endm + +.macro SAVE1x1 + + mr T1, CO + +#ifndef TRMMKERNEL + + lxsspx vs0, o0, T1 + +#endif + +#ifdef TRMMKERNEL + xsmuldp vs0, vs32, alpha_r +#else + xsmaddadp vs0, vs32, alpha_r +#endif + + stxsspx vs0, o0, T1 + + add T1, T1, LDC + + addi CO, CO, 4 + +.endm + diff --git a/param.h b/param.h index 370d10b9a..fb344cd33 100644 --- a/param.h +++ b/param.h @@ -1964,7 +1964,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SNUMOPT 16 #define DNUMOPT 8 -#define GEMM_DEFAULT_OFFSET_A 384 +#define GEMM_DEFAULT_OFFSET_A 131072 #define GEMM_DEFAULT_OFFSET_B 1024 #define GEMM_DEFAULT_ALIGN 0x03fffUL @@ -1977,17 +1977,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define ZGEMM_DEFAULT_UNROLL_M 8 #define ZGEMM_DEFAULT_UNROLL_N 2 -#define SGEMM_DEFAULT_P 480 +#define SGEMM_DEFAULT_P 960 #define DGEMM_DEFAULT_P 480 #define CGEMM_DEFAULT_P 480 #define ZGEMM_DEFAULT_P 240 -#define SGEMM_DEFAULT_Q 1440 +#define SGEMM_DEFAULT_Q 720 #define DGEMM_DEFAULT_Q 720 #define CGEMM_DEFAULT_Q 720 #define ZGEMM_DEFAULT_Q 360 -#define SGEMM_DEFAULT_R 28800 +#define SGEMM_DEFAULT_R 14400 #define DGEMM_DEFAULT_R 14400 #define CGEMM_DEFAULT_R 14400 #define ZGEMM_DEFAULT_R 7200 From 12540cedb5eb3a1160623fbbc9c9f67572a92e5f Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Sun, 3 Apr 2016 07:21:48 +0200 Subject: [PATCH 38/48] added ESSL to Makefile for benchmarks --- benchmark/Makefile | 32 ++++++++++++++++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) diff --git a/benchmark/Makefile b/benchmark/Makefile index 4692c640e..badd42c6b 100644 --- a/benchmark/Makefile +++ b/benchmark/Makefile @@ -33,6 +33,9 @@ LIBMKL = -L$(MKL) -lmkl_intel_lp64 -lmkl_gnu_thread -lmkl_core -lgomp -lpthread # Apple vecLib LIBVECLIB = -framework Accelerate +ESSL=/opt/ibm/lib +LIBESSL = -lessl $(ESSL)/libxlomp_ser.so.1 $(ESSL)/libxlf90_r.so.1 $(ESSL)/libxlfmath.so.1 $(ESSL)/libxlsmp.so.1 /opt/ibm/xlC/13.1.2/lib/libxl.a + ifeq ($(OSNAME), WINNT) goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ @@ -255,7 +258,8 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ endif - +essl :: sgemm.essl strmm.essl dgemm.essl dtrmm.essl \ + cgemm.essl ctrmm.essl zgemm.essl ztrmm.essl veclib :: slinpack.veclib dlinpack.veclib clinpack.veclib zlinpack.veclib \ scholesky.veclib dcholesky.veclib ccholesky.veclib zcholesky.veclib \ @@ -443,6 +447,9 @@ sgemm.mkl : sgemm.$(SUFFIX) sgemm.veclib : sgemm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +sgemm.essl : sgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Dgemm 
#################################################### dgemm.goto : dgemm.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm @@ -459,6 +466,9 @@ dgemm.mkl : dgemm.$(SUFFIX) dgemm.veclib : dgemm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +dgemm.essl : dgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Cgemm #################################################### cgemm.goto : cgemm.$(SUFFIX) ../$(LIBNAME) @@ -476,6 +486,9 @@ cgemm.mkl : cgemm.$(SUFFIX) cgemm.veclib : cgemm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +cgemm.essl : cgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Zgemm #################################################### zgemm.goto : zgemm.$(SUFFIX) ../$(LIBNAME) @@ -493,6 +506,9 @@ zgemm.mkl : zgemm.$(SUFFIX) zgemm.veclib : zgemm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +zgemm.essl : zgemm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Ssymm #################################################### ssymm.goto : ssymm.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm @@ -575,6 +591,9 @@ strmm.mkl : strmm.$(SUFFIX) strmm.veclib : strmm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +strmm.essl : strmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Dtrmm #################################################### dtrmm.goto : dtrmm.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm @@ -591,6 +610,9 @@ dtrmm.mkl : dtrmm.$(SUFFIX) 
dtrmm.veclib : dtrmm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +dtrmm.essl : dtrmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Ctrmm #################################################### ctrmm.goto : ctrmm.$(SUFFIX) ../$(LIBNAME) @@ -608,6 +630,9 @@ ctrmm.mkl : ctrmm.$(SUFFIX) ctrmm.veclib : ctrmm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +ctrmm.essl : ctrmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Ztrmm #################################################### ztrmm.goto : ztrmm.$(SUFFIX) ../$(LIBNAME) @@ -625,6 +650,9 @@ ztrmm.mkl : ztrmm.$(SUFFIX) ztrmm.veclib : ztrmm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +ztrmm.essl : ztrmm.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Strsm #################################################### strsm.goto : strsm.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm @@ -2179,7 +2207,7 @@ smallscaling: smallscaling.c ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(EXTRALIB) -fopenmp -lm clean :: - @rm -f *.goto *.mkl *.acml *.atlas *.veclib + @rm -f *.goto *.mkl *.acml *.atlas *.veclib *.essl include $(TOPDIR)/Makefile.tail From d4c0330967f13ce916da41391bc1ccf383c34b5b Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Sun, 3 Apr 2016 14:30:49 +0200 Subject: [PATCH 39/48] updated cgemm- and ctrmm-kernel for POWER8 --- kernel/power/cgemm_kernel_8x4_power8.S | 46 +- kernel/power/cgemm_logic_8x4_power8.S | 558 ++-- kernel/power/cgemm_macros_8x4_power8.S | 4075 ++++++++++++------------ kernel/power/ctrmm_kernel_8x4_power8.S | 44 +- kernel/power/ctrmm_logic_8x4_power8.S | 555 ++-- 5 files changed, 2697 
insertions(+), 2581 deletions(-) diff --git a/kernel/power/cgemm_kernel_8x4_power8.S b/kernel/power/cgemm_kernel_8x4_power8.S index f732c8132..a7e706699 100644 --- a/kernel/power/cgemm_kernel_8x4_power8.S +++ b/kernel/power/cgemm_kernel_8x4_power8.S @@ -26,11 +26,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** -* 2016/03/18 Werner Saar (wernsaar@googlemail.com) +* 2016/04/03 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK -* LAPACK-TEST : OK +* LAPACK-TEST : OK **************************************************************************************/ /*********************************************************************/ @@ -130,10 +130,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #define o0 0 -#define alpha_r vs30 -#define alpha_i vs31 -#define TBUFFER r14 +#define alpha_dr vs28 +#define alpha_di vs29 +#define alpha_sr vs30 +#define alpha_si vs31 + + +#define NOTUSED r14 #define L r15 #define o12 r16 #define o4 r17 @@ -271,21 +275,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "cgemm_macros_8x4_power8.S" cmpwi cr0, M, 0 - ble .L999_H1 + ble L999_H1 cmpwi cr0, N, 0 - ble .L999_H1 + ble L999_H1 cmpwi cr0, K, 0 - ble .L999_H1 + ble L999_H1 slwi LDC, LDC, ZBASE_SHIFT - li PRE, 256 + li PRE, 384 li o4 , 4 li o8 , 8 li o12 , 12 li o16 , 16 li o32 , 32 li o48 , 48 - addi TBUFFER, SP, 360 #ifdef __64BIT__ @@ -294,14 +297,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi T1 , SP, 224 #endif - lxsspx alpha_r, 0, T1 - lxsspx alpha_i, o8, T1 + stxsspx vs1, 0, T1 + lxsspx alpha_dr, 0, T1 + stxsspx vs2, o8 , T1 + lxsspx alpha_di, o8, T1 + addi T1, SP, 360 + li T2, 0 + + stw T2, 0(T1) + stw T2, 4(T1) + stw T2, 8(T1) + stxsspx alpha_dr, o12, T1 + lxvw4x alpha_sr, o0 , T1 + addi T1, T1, 16 + + stw T2, 0(T1) + stw T2, 4(T1) + stw T2, 8(T1) + stxsspx alpha_di, o12, T1 + lxvw4x alpha_si, o0 , T1 .align 5 #include "cgemm_logic_8x4_power8.S" -.L999: +L999: addi r3, 0, 0 lfd f14, 0(SP) diff --git a/kernel/power/cgemm_logic_8x4_power8.S b/kernel/power/cgemm_logic_8x4_power8.S index 51a063126..851a09aaa 100644 --- a/kernel/power/cgemm_logic_8x4_power8.S +++ b/kernel/power/cgemm_logic_8x4_power8.S @@ -26,38 +26,38 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** -* 2016/03/18 Werner Saar (wernsaar@googlemail.com) +* 2016/04/03 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK -* LAPACK-TEST : OK +* LAPACK-TEST : OK **************************************************************************************/ - srawi. J, N, 2 - ble .LCGEMM_L4_END + ble CGEMM_L4_END -.LCGEMM_L4_BEGIN: +CGEMM_L4_BEGIN: mr CO, C mr AO, A slwi T1, LDC , 2 add C, C, T1 srawi. I, M, 3 - ble .LCGEMM_L4x8_END + ble CGEMM_L4x8_END -.LCGEMM_L4x8_BEGIN: +CGEMM_L4x8_BEGIN: mr BO, B srawi. L, K, 3 - ble .LCGEMM_L4x8_SUB0 + ble CGEMM_L4x8_SUB0 cmpwi cr0, L, 1 - ble .LCGEMM_L4x8_SUB4 + ble CGEMM_L4x8_SUB4 -.LCGEMM_L4x8_LOOP_START: +CGEMM_L4x8_LOOP_START: dcbt AO, PRE + dcbt BO, PRE LOAD4x8_1 KERNEL4x8_I1 dcbt AO, PRE @@ -68,17 +68,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x8_1 dcbt AO, PRE + dcbt BO, PRE KERNEL4x8_2 KERNEL4x8_1 dcbt AO, PRE KERNEL4x8_2 addic. 
L, L, -2 - ble .LCGEMM_L4x8_LOOP_END + ble CGEMM_L4x8_LOOP_END .align 5 -.LCGEMM_L4x8_LOOP: +CGEMM_L4x8_LOOP: KERNEL4x8_1 dcbt AO, PRE @@ -89,15 +90,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x8_1 dcbt AO, PRE + dcbt BO, PRE KERNEL4x8_2 KERNEL4x8_1 dcbt AO, PRE KERNEL4x8_2 addic. L, L, -1 - bgt .LCGEMM_L4x8_LOOP + bgt CGEMM_L4x8_LOOP -.LCGEMM_L4x8_LOOP_END: +CGEMM_L4x8_LOOP_END: KERNEL4x8_1 dcbt AO, PRE @@ -112,9 +114,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x8_1 KERNEL4x8_E2 - b .LCGEMM_L4x8_SUB1 + b CGEMM_L4x8_SUB1 -.LCGEMM_L4x8_SUB4: +CGEMM_L4x8_SUB4: KERNEL4x8_SUBI1 KERNEL4x8_SUB1 @@ -126,53 +128,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x8_SUB1 KERNEL4x8_SUB1 - b .LCGEMM_L4x8_SUB1 + b CGEMM_L4x8_SUB1 -.LCGEMM_L4x8_SUB0: +CGEMM_L4x8_SUB0: andi. L, K, 7 KERNEL4x8_SUBI1 addic. L, L, -1 - ble .LCGEMM_L4x8_SAVE - b .LCGEMM_L4x8_SUB2 + ble CGEMM_L4x8_SAVE + b CGEMM_L4x8_SUB2 -.LCGEMM_L4x8_SUB1: +CGEMM_L4x8_SUB1: andi. L, K, 7 - ble .LCGEMM_L4x8_SAVE + ble CGEMM_L4x8_SAVE -.LCGEMM_L4x8_SUB2: +CGEMM_L4x8_SUB2: KERNEL4x8_SUB1 addic. L, L, -1 - bgt .LCGEMM_L4x8_SUB2 + bgt CGEMM_L4x8_SUB2 -.LCGEMM_L4x8_SAVE: +CGEMM_L4x8_SAVE: SAVE4x8 addic. I, I, -1 - bgt .LCGEMM_L4x8_BEGIN + bgt CGEMM_L4x8_BEGIN -.LCGEMM_L4x8_END: +CGEMM_L4x8_END: -.LCGEMM_L4x4_BEGIN: +CGEMM_L4x4_BEGIN: andi. T2, M, 7 - ble .LCGEMM_L4x1_END + ble CGEMM_L4x1_END andi. T1, M, 4 - ble .LCGEMM_L4x4_END + ble CGEMM_L4x4_END mr BO, B srawi. L, K, 3 - ble .LCGEMM_L4x4_SUB0 + ble CGEMM_L4x4_SUB0 cmpwi cr0, L, 1 - ble .LCGEMM_L4x4_SUB4 + ble CGEMM_L4x4_SUB4 -.LCGEMM_L4x4_LOOP_START: +CGEMM_L4x4_LOOP_START: LOAD4x4_1 KERNEL4x4_I1 @@ -186,11 +188,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x4_2 addic. 
L, L, -2 - ble .LCGEMM_L4x4_LOOP_END + ble CGEMM_L4x4_LOOP_END .align 5 -.LCGEMM_L4x4_LOOP: +CGEMM_L4x4_LOOP: KERNEL4x4_1 KERNEL4x4_2 @@ -203,9 +205,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x4_2 addic. L, L, -1 - bgt .LCGEMM_L4x4_LOOP + bgt CGEMM_L4x4_LOOP -.LCGEMM_L4x4_LOOP_END: +CGEMM_L4x4_LOOP_END: KERNEL4x4_1 KERNEL4x4_2 @@ -217,9 +219,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x4_1 KERNEL4x4_E2 - b .LCGEMM_L4x4_SUB1 + b CGEMM_L4x4_SUB1 -.LCGEMM_L4x4_SUB4: +CGEMM_L4x4_SUB4: KERNEL4x4_SUBI1 KERNEL4x4_SUB1 @@ -231,48 +233,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x4_SUB1 KERNEL4x4_SUB1 - b .LCGEMM_L4x4_SUB1 + b CGEMM_L4x4_SUB1 -.LCGEMM_L4x4_SUB0: +CGEMM_L4x4_SUB0: andi. L, K, 7 KERNEL4x4_SUBI1 addic. L, L, -1 - ble .LCGEMM_L4x4_SAVE - b .LCGEMM_L4x4_SUB2 + ble CGEMM_L4x4_SAVE + b CGEMM_L4x4_SUB2 -.LCGEMM_L4x4_SUB1: +CGEMM_L4x4_SUB1: andi. L, K, 7 - ble .LCGEMM_L4x4_SAVE + ble CGEMM_L4x4_SAVE -.LCGEMM_L4x4_SUB2: +CGEMM_L4x4_SUB2: KERNEL4x4_SUB1 addic. L, L, -1 - bgt .LCGEMM_L4x4_SUB2 + bgt CGEMM_L4x4_SUB2 -.LCGEMM_L4x4_SAVE: +CGEMM_L4x4_SAVE: SAVE4x4 -.LCGEMM_L4x4_END: +CGEMM_L4x4_END: -.LCGEMM_L4x2_BEGIN: +CGEMM_L4x2_BEGIN: andi. T1, M, 2 - ble .LCGEMM_L4x2_END + ble CGEMM_L4x2_END mr BO, B srawi. L, K, 3 - ble .LCGEMM_L4x2_SUB0 + ble CGEMM_L4x2_SUB0 cmpwi cr0, L, 1 - ble .LCGEMM_L4x2_SUB4 + ble CGEMM_L4x2_SUB4 -.LCGEMM_L4x2_LOOP_START: +CGEMM_L4x2_LOOP_START: LOAD4x2_1 KERNEL4x2_I1 @@ -286,11 +288,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x2_2 addic. L, L, -2 - ble .LCGEMM_L4x2_LOOP_END + ble CGEMM_L4x2_LOOP_END .align 5 -.LCGEMM_L4x2_LOOP: +CGEMM_L4x2_LOOP: KERNEL4x2_1 KERNEL4x2_2 @@ -303,9 +305,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x2_2 addic. 
L, L, -1 - bgt .LCGEMM_L4x2_LOOP + bgt CGEMM_L4x2_LOOP -.LCGEMM_L4x2_LOOP_END: +CGEMM_L4x2_LOOP_END: KERNEL4x2_1 KERNEL4x2_2 @@ -317,9 +319,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x2_1 KERNEL4x2_E2 - b .LCGEMM_L4x2_SUB1 + b CGEMM_L4x2_SUB1 -.LCGEMM_L4x2_SUB4: +CGEMM_L4x2_SUB4: KERNEL4x2_SUBI1 KERNEL4x2_SUB1 @@ -331,48 +333,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x2_SUB1 KERNEL4x2_SUB1 - b .LCGEMM_L4x2_SUB1 + b CGEMM_L4x2_SUB1 -.LCGEMM_L4x2_SUB0: +CGEMM_L4x2_SUB0: andi. L, K, 7 KERNEL4x2_SUBI1 addic. L, L, -1 - ble .LCGEMM_L4x2_SAVE - b .LCGEMM_L4x2_SUB2 + ble CGEMM_L4x2_SAVE + b CGEMM_L4x2_SUB2 -.LCGEMM_L4x2_SUB1: +CGEMM_L4x2_SUB1: andi. L, K, 7 - ble .LCGEMM_L4x2_SAVE + ble CGEMM_L4x2_SAVE -.LCGEMM_L4x2_SUB2: +CGEMM_L4x2_SUB2: KERNEL4x2_SUB1 addic. L, L, -1 - bgt .LCGEMM_L4x2_SUB2 + bgt CGEMM_L4x2_SUB2 -.LCGEMM_L4x2_SAVE: +CGEMM_L4x2_SAVE: SAVE4x2 -.LCGEMM_L4x2_END: +CGEMM_L4x2_END: -.LCGEMM_L4x1_BEGIN: +CGEMM_L4x1_BEGIN: andi. T1, M, 1 - ble .LCGEMM_L4x1_END + ble CGEMM_L4x1_END mr BO, B srawi. L, K, 3 - ble .LCGEMM_L4x1_SUB0 + ble CGEMM_L4x1_SUB0 cmpwi cr0, L, 1 - ble .LCGEMM_L4x1_SUB4 + ble CGEMM_L4x1_SUB4 -.LCGEMM_L4x1_LOOP_START: +CGEMM_L4x1_LOOP_START: LOAD4x1_1 KERNEL4x1_I1 @@ -386,11 +388,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x1_2 addic. L, L, -2 - ble .LCGEMM_L4x1_LOOP_END + ble CGEMM_L4x1_LOOP_END .align 5 -.LCGEMM_L4x1_LOOP: +CGEMM_L4x1_LOOP: KERNEL4x1_1 KERNEL4x1_2 @@ -403,9 +405,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x1_2 addic. L, L, -1 - bgt .LCGEMM_L4x1_LOOP + bgt CGEMM_L4x1_LOOP -.LCGEMM_L4x1_LOOP_END: +CGEMM_L4x1_LOOP_END: KERNEL4x1_1 KERNEL4x1_2 @@ -417,9 +419,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
KERNEL4x1_1 KERNEL4x1_E2 - b .LCGEMM_L4x1_SUB1 + b CGEMM_L4x1_SUB1 -.LCGEMM_L4x1_SUB4: +CGEMM_L4x1_SUB4: KERNEL4x1_SUBI1 KERNEL4x1_SUB1 @@ -431,74 +433,74 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x1_SUB1 KERNEL4x1_SUB1 - b .LCGEMM_L4x1_SUB1 + b CGEMM_L4x1_SUB1 -.LCGEMM_L4x1_SUB0: +CGEMM_L4x1_SUB0: andi. L, K, 7 KERNEL4x1_SUBI1 addic. L, L, -1 - ble .LCGEMM_L4x1_SAVE - b .LCGEMM_L4x1_SUB2 + ble CGEMM_L4x1_SAVE + b CGEMM_L4x1_SUB2 -.LCGEMM_L4x1_SUB1: +CGEMM_L4x1_SUB1: andi. L, K, 7 - ble .LCGEMM_L4x1_SAVE + ble CGEMM_L4x1_SAVE -.LCGEMM_L4x1_SUB2: +CGEMM_L4x1_SUB2: KERNEL4x1_SUB1 addic. L, L, -1 - bgt .LCGEMM_L4x1_SUB2 + bgt CGEMM_L4x1_SUB2 -.LCGEMM_L4x1_SAVE: +CGEMM_L4x1_SAVE: SAVE4x1 -.LCGEMM_L4x1_END: +CGEMM_L4x1_END: slwi T1, K, 5 add B, B, T1 addic. J, J, -1 - bgt .LCGEMM_L4_BEGIN + bgt CGEMM_L4_BEGIN andi. T2, N, 3 - ble .L999_H2 + ble L999_H2 -.LCGEMM_L4_END: +CGEMM_L4_END: - b .LCGEMM_L2_BEGIN + b CGEMM_L2_BEGIN -.L999_H1: +L999_H1: - b .L999_H2 + b L999_H2 -.LCGEMM_L2_BEGIN: +CGEMM_L2_BEGIN: andi. T1, N, 2 - ble .LCGEMM_L2_END + ble CGEMM_L2_END mr CO, C mr AO, A slwi T1, LDC , 1 add C, C, T1 srawi. I, M, 3 - ble .LCGEMM_L2x8_END + ble CGEMM_L2x8_END -.LCGEMM_L2x8_BEGIN: +CGEMM_L2x8_BEGIN: mr BO, B srawi. L, K, 3 - ble .LCGEMM_L2x8_SUB0 + ble CGEMM_L2x8_SUB0 cmpwi cr0, L, 1 - ble .LCGEMM_L2x8_SUB4 + ble CGEMM_L2x8_SUB4 -.LCGEMM_L2x8_LOOP_START: +CGEMM_L2x8_LOOP_START: dcbt AO, PRE LOAD2x8_1 @@ -517,11 +519,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x8_2 addic. L, L, -2 - ble .LCGEMM_L2x8_LOOP_END + ble CGEMM_L2x8_LOOP_END .align 5 -.LCGEMM_L2x8_LOOP: +CGEMM_L2x8_LOOP: KERNEL2x8_1 dcbt AO, PRE @@ -538,9 +540,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x8_2 addic. 
L, L, -1 - bgt .LCGEMM_L2x8_LOOP + bgt CGEMM_L2x8_LOOP -.LCGEMM_L2x8_LOOP_END: +CGEMM_L2x8_LOOP_END: KERNEL2x8_1 dcbt AO, PRE @@ -555,9 +557,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x8_1 KERNEL2x8_E2 - b .LCGEMM_L2x8_SUB1 + b CGEMM_L2x8_SUB1 -.LCGEMM_L2x8_SUB4: +CGEMM_L2x8_SUB4: KERNEL2x8_SUBI1 KERNEL2x8_SUB1 @@ -569,53 +571,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x8_SUB1 KERNEL2x8_SUB1 - b .LCGEMM_L2x8_SUB1 + b CGEMM_L2x8_SUB1 -.LCGEMM_L2x8_SUB0: +CGEMM_L2x8_SUB0: andi. L, K, 7 KERNEL2x8_SUBI1 addic. L, L, -1 - ble .LCGEMM_L2x8_SAVE - b .LCGEMM_L2x8_SUB2 + ble CGEMM_L2x8_SAVE + b CGEMM_L2x8_SUB2 -.LCGEMM_L2x8_SUB1: +CGEMM_L2x8_SUB1: andi. L, K, 7 - ble .LCGEMM_L2x8_SAVE + ble CGEMM_L2x8_SAVE -.LCGEMM_L2x8_SUB2: +CGEMM_L2x8_SUB2: KERNEL2x8_SUB1 addic. L, L, -1 - bgt .LCGEMM_L2x8_SUB2 + bgt CGEMM_L2x8_SUB2 -.LCGEMM_L2x8_SAVE: +CGEMM_L2x8_SAVE: SAVE2x8 addic. I, I, -1 - bgt .LCGEMM_L2x8_BEGIN + bgt CGEMM_L2x8_BEGIN -.LCGEMM_L2x8_END: +CGEMM_L2x8_END: -.LCGEMM_L2x4_BEGIN: +CGEMM_L2x4_BEGIN: andi. T2, M, 7 - ble .LCGEMM_L2x1_END + ble CGEMM_L2x1_END andi. T1, M, 4 - ble .LCGEMM_L2x4_END + ble CGEMM_L2x4_END mr BO, B srawi. L, K, 3 - ble .LCGEMM_L2x4_SUB0 + ble CGEMM_L2x4_SUB0 cmpwi cr0, L, 1 - ble .LCGEMM_L2x4_SUB4 + ble CGEMM_L2x4_SUB4 -.LCGEMM_L2x4_LOOP_START: +CGEMM_L2x4_LOOP_START: LOAD2x4_1 KERNEL2x4_I1 @@ -629,11 +631,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x4_2 addic. L, L, -2 - ble .LCGEMM_L2x4_LOOP_END + ble CGEMM_L2x4_LOOP_END .align 5 -.LCGEMM_L2x4_LOOP: +CGEMM_L2x4_LOOP: KERNEL2x4_1 KERNEL2x4_2 @@ -646,9 +648,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x4_2 addic. L, L, -1 - bgt .LCGEMM_L2x4_LOOP + bgt CGEMM_L2x4_LOOP -.LCGEMM_L2x4_LOOP_END: +CGEMM_L2x4_LOOP_END: KERNEL2x4_1 KERNEL2x4_2 @@ -660,9 +662,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
KERNEL2x4_1 KERNEL2x4_E2 - b .LCGEMM_L2x4_SUB1 + b CGEMM_L2x4_SUB1 -.LCGEMM_L2x4_SUB4: +CGEMM_L2x4_SUB4: KERNEL2x4_SUBI1 KERNEL2x4_SUB1 @@ -674,48 +676,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x4_SUB1 KERNEL2x4_SUB1 - b .LCGEMM_L2x4_SUB1 + b CGEMM_L2x4_SUB1 -.LCGEMM_L2x4_SUB0: +CGEMM_L2x4_SUB0: andi. L, K, 7 KERNEL2x4_SUBI1 addic. L, L, -1 - ble .LCGEMM_L2x4_SAVE - b .LCGEMM_L2x4_SUB2 + ble CGEMM_L2x4_SAVE + b CGEMM_L2x4_SUB2 -.LCGEMM_L2x4_SUB1: +CGEMM_L2x4_SUB1: andi. L, K, 7 - ble .LCGEMM_L2x4_SAVE + ble CGEMM_L2x4_SAVE -.LCGEMM_L2x4_SUB2: +CGEMM_L2x4_SUB2: KERNEL2x4_SUB1 addic. L, L, -1 - bgt .LCGEMM_L2x4_SUB2 + bgt CGEMM_L2x4_SUB2 -.LCGEMM_L2x4_SAVE: +CGEMM_L2x4_SAVE: SAVE2x4 -.LCGEMM_L2x4_END: +CGEMM_L2x4_END: -.LCGEMM_L2x2_BEGIN: +CGEMM_L2x2_BEGIN: andi. T1, M, 2 - ble .LCGEMM_L2x2_END + ble CGEMM_L2x2_END mr BO, B srawi. L, K, 3 - ble .LCGEMM_L2x2_SUB0 + ble CGEMM_L2x2_SUB0 cmpwi cr0, L, 1 - ble .LCGEMM_L2x2_SUB4 + ble CGEMM_L2x2_SUB4 -.LCGEMM_L2x2_LOOP_START: +CGEMM_L2x2_LOOP_START: LOAD2x2_1 KERNEL2x2_I1 @@ -729,11 +731,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x2_2 addic. L, L, -2 - ble .LCGEMM_L2x2_LOOP_END + ble CGEMM_L2x2_LOOP_END .align 5 -.LCGEMM_L2x2_LOOP: +CGEMM_L2x2_LOOP: KERNEL2x2_1 KERNEL2x2_2 @@ -746,9 +748,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x2_2 addic. L, L, -1 - bgt .LCGEMM_L2x2_LOOP + bgt CGEMM_L2x2_LOOP -.LCGEMM_L2x2_LOOP_END: +CGEMM_L2x2_LOOP_END: KERNEL2x2_1 KERNEL2x2_2 @@ -760,9 +762,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x2_1 KERNEL2x2_E2 - b .LCGEMM_L2x2_SUB1 + b CGEMM_L2x2_SUB1 -.LCGEMM_L2x2_SUB4: +CGEMM_L2x2_SUB4: KERNEL2x2_SUBI1 KERNEL2x2_SUB1 @@ -774,48 +776,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x2_SUB1 KERNEL2x2_SUB1 - b .LCGEMM_L2x2_SUB1 + b CGEMM_L2x2_SUB1 -.LCGEMM_L2x2_SUB0: +CGEMM_L2x2_SUB0: andi. 
L, K, 7 KERNEL2x2_SUBI1 addic. L, L, -1 - ble .LCGEMM_L2x2_SAVE - b .LCGEMM_L2x2_SUB2 + ble CGEMM_L2x2_SAVE + b CGEMM_L2x2_SUB2 -.LCGEMM_L2x2_SUB1: +CGEMM_L2x2_SUB1: andi. L, K, 7 - ble .LCGEMM_L2x2_SAVE + ble CGEMM_L2x2_SAVE -.LCGEMM_L2x2_SUB2: +CGEMM_L2x2_SUB2: KERNEL2x2_SUB1 addic. L, L, -1 - bgt .LCGEMM_L2x2_SUB2 + bgt CGEMM_L2x2_SUB2 -.LCGEMM_L2x2_SAVE: +CGEMM_L2x2_SAVE: SAVE2x2 -.LCGEMM_L2x2_END: +CGEMM_L2x2_END: -.LCGEMM_L2x1_BEGIN: +CGEMM_L2x1_BEGIN: andi. T1, M, 1 - ble .LCGEMM_L2x1_END + ble CGEMM_L2x1_END mr BO, B srawi. L, K, 3 - ble .LCGEMM_L2x1_SUB0 + ble CGEMM_L2x1_SUB0 cmpwi cr0, L, 1 - ble .LCGEMM_L2x1_SUB4 + ble CGEMM_L2x1_SUB4 -.LCGEMM_L2x1_LOOP_START: +CGEMM_L2x1_LOOP_START: LOAD2x1_1 KERNEL2x1_I1 @@ -829,11 +831,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x1_2 addic. L, L, -2 - ble .LCGEMM_L2x1_LOOP_END + ble CGEMM_L2x1_LOOP_END .align 5 -.LCGEMM_L2x1_LOOP: +CGEMM_L2x1_LOOP: KERNEL2x1_1 KERNEL2x1_2 @@ -846,9 +848,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x1_2 addic. L, L, -1 - bgt .LCGEMM_L2x1_LOOP + bgt CGEMM_L2x1_LOOP -.LCGEMM_L2x1_LOOP_END: +CGEMM_L2x1_LOOP_END: KERNEL2x1_1 KERNEL2x1_2 @@ -860,9 +862,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x1_1 KERNEL2x1_E2 - b .LCGEMM_L2x1_SUB1 + b CGEMM_L2x1_SUB1 -.LCGEMM_L2x1_SUB4: +CGEMM_L2x1_SUB4: KERNEL2x1_SUBI1 KERNEL2x1_SUB1 @@ -874,66 +876,66 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x1_SUB1 KERNEL2x1_SUB1 - b .LCGEMM_L2x1_SUB1 + b CGEMM_L2x1_SUB1 -.LCGEMM_L2x1_SUB0: +CGEMM_L2x1_SUB0: andi. L, K, 7 KERNEL2x1_SUBI1 addic. L, L, -1 - ble .LCGEMM_L2x1_SAVE - b .LCGEMM_L2x1_SUB2 + ble CGEMM_L2x1_SAVE + b CGEMM_L2x1_SUB2 -.LCGEMM_L2x1_SUB1: +CGEMM_L2x1_SUB1: andi. L, K, 7 - ble .LCGEMM_L2x1_SAVE + ble CGEMM_L2x1_SAVE -.LCGEMM_L2x1_SUB2: +CGEMM_L2x1_SUB2: KERNEL2x1_SUB1 addic. 
L, L, -1 - bgt .LCGEMM_L2x1_SUB2 + bgt CGEMM_L2x1_SUB2 -.LCGEMM_L2x1_SAVE: +CGEMM_L2x1_SAVE: SAVE2x1 -.LCGEMM_L2x1_END: +CGEMM_L2x1_END: slwi T1, K, 4 add B, B, T1 -.LCGEMM_L2_END: +CGEMM_L2_END: - b .LCGEMM_L1_BEGIN + b CGEMM_L1_BEGIN -.L999_H2: +L999_H2: - b .L999 + b L999 -.LCGEMM_L1_BEGIN: +CGEMM_L1_BEGIN: andi. T1, N, 1 - ble .LCGEMM_L1_END + ble CGEMM_L1_END mr CO, C mr AO, A srawi. I, M, 3 - ble .LCGEMM_L1x8_END + ble CGEMM_L1x8_END -.LCGEMM_L1x8_BEGIN: +CGEMM_L1x8_BEGIN: mr BO, B srawi. L, K, 3 - ble .LCGEMM_L1x8_SUB0 + ble CGEMM_L1x8_SUB0 cmpwi cr0, L, 1 - ble .LCGEMM_L1x8_SUB4 + ble CGEMM_L1x8_SUB4 -.LCGEMM_L1x8_LOOP_START: +CGEMM_L1x8_LOOP_START: dcbt AO, PRE LOAD1x8_1 @@ -952,11 +954,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x8_2 addic. L, L, -2 - ble .LCGEMM_L1x8_LOOP_END + ble CGEMM_L1x8_LOOP_END .align 5 -.LCGEMM_L1x8_LOOP: +CGEMM_L1x8_LOOP: KERNEL1x8_1 dcbt AO, PRE @@ -973,9 +975,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x8_2 addic. L, L, -1 - bgt .LCGEMM_L1x8_LOOP + bgt CGEMM_L1x8_LOOP -.LCGEMM_L1x8_LOOP_END: +CGEMM_L1x8_LOOP_END: KERNEL1x8_1 dcbt AO, PRE @@ -990,9 +992,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x8_1 KERNEL1x8_E2 - b .LCGEMM_L1x8_SUB1 + b CGEMM_L1x8_SUB1 -.LCGEMM_L1x8_SUB4: +CGEMM_L1x8_SUB4: KERNEL1x8_SUBI1 KERNEL1x8_SUB1 @@ -1004,53 +1006,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x8_SUB1 KERNEL1x8_SUB1 - b .LCGEMM_L1x8_SUB1 + b CGEMM_L1x8_SUB1 -.LCGEMM_L1x8_SUB0: +CGEMM_L1x8_SUB0: andi. L, K, 7 KERNEL1x8_SUBI1 addic. L, L, -1 - ble .LCGEMM_L1x8_SAVE - b .LCGEMM_L1x8_SUB2 + ble CGEMM_L1x8_SAVE + b CGEMM_L1x8_SUB2 -.LCGEMM_L1x8_SUB1: +CGEMM_L1x8_SUB1: andi. L, K, 7 - ble .LCGEMM_L1x8_SAVE + ble CGEMM_L1x8_SAVE -.LCGEMM_L1x8_SUB2: +CGEMM_L1x8_SUB2: KERNEL1x8_SUB1 addic. 
L, L, -1 - bgt .LCGEMM_L1x8_SUB2 + bgt CGEMM_L1x8_SUB2 -.LCGEMM_L1x8_SAVE: +CGEMM_L1x8_SAVE: SAVE1x8 addic. I, I, -1 - bgt .LCGEMM_L1x8_BEGIN + bgt CGEMM_L1x8_BEGIN -.LCGEMM_L1x8_END: +CGEMM_L1x8_END: -.LCGEMM_L1x4_BEGIN: +CGEMM_L1x4_BEGIN: andi. T2, M, 7 - ble .LCGEMM_L1x1_END + ble CGEMM_L1x1_END andi. T1, M, 4 - ble .LCGEMM_L1x4_END + ble CGEMM_L1x4_END mr BO, B srawi. L, K, 3 - ble .LCGEMM_L1x4_SUB0 + ble CGEMM_L1x4_SUB0 cmpwi cr0, L, 1 - ble .LCGEMM_L1x4_SUB4 + ble CGEMM_L1x4_SUB4 -.LCGEMM_L1x4_LOOP_START: +CGEMM_L1x4_LOOP_START: LOAD1x4_1 KERNEL1x4_I1 @@ -1064,11 +1066,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x4_2 addic. L, L, -2 - ble .LCGEMM_L1x4_LOOP_END + ble CGEMM_L1x4_LOOP_END .align 5 -.LCGEMM_L1x4_LOOP: +CGEMM_L1x4_LOOP: KERNEL1x4_1 KERNEL1x4_2 @@ -1081,9 +1083,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x4_2 addic. L, L, -1 - bgt .LCGEMM_L1x4_LOOP + bgt CGEMM_L1x4_LOOP -.LCGEMM_L1x4_LOOP_END: +CGEMM_L1x4_LOOP_END: KERNEL1x4_1 KERNEL1x4_2 @@ -1095,9 +1097,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x4_1 KERNEL1x4_E2 - b .LCGEMM_L1x4_SUB1 + b CGEMM_L1x4_SUB1 -.LCGEMM_L1x4_SUB4: +CGEMM_L1x4_SUB4: KERNEL1x4_SUBI1 KERNEL1x4_SUB1 @@ -1109,48 +1111,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x4_SUB1 KERNEL1x4_SUB1 - b .LCGEMM_L1x4_SUB1 + b CGEMM_L1x4_SUB1 -.LCGEMM_L1x4_SUB0: +CGEMM_L1x4_SUB0: andi. L, K, 7 KERNEL1x4_SUBI1 addic. L, L, -1 - ble .LCGEMM_L1x4_SAVE - b .LCGEMM_L1x4_SUB2 + ble CGEMM_L1x4_SAVE + b CGEMM_L1x4_SUB2 -.LCGEMM_L1x4_SUB1: +CGEMM_L1x4_SUB1: andi. L, K, 7 - ble .LCGEMM_L1x4_SAVE + ble CGEMM_L1x4_SAVE -.LCGEMM_L1x4_SUB2: +CGEMM_L1x4_SUB2: KERNEL1x4_SUB1 addic. L, L, -1 - bgt .LCGEMM_L1x4_SUB2 + bgt CGEMM_L1x4_SUB2 -.LCGEMM_L1x4_SAVE: +CGEMM_L1x4_SAVE: SAVE1x4 -.LCGEMM_L1x4_END: +CGEMM_L1x4_END: -.LCGEMM_L1x2_BEGIN: +CGEMM_L1x2_BEGIN: andi. 
T1, M, 2 - ble .LCGEMM_L1x2_END + ble CGEMM_L1x2_END mr BO, B srawi. L, K, 3 - ble .LCGEMM_L1x2_SUB0 + ble CGEMM_L1x2_SUB0 cmpwi cr0, L, 1 - ble .LCGEMM_L1x2_SUB4 + ble CGEMM_L1x2_SUB4 -.LCGEMM_L1x2_LOOP_START: +CGEMM_L1x2_LOOP_START: LOAD1x2_1 KERNEL1x2_I1 @@ -1164,11 +1166,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x2_2 addic. L, L, -2 - ble .LCGEMM_L1x2_LOOP_END + ble CGEMM_L1x2_LOOP_END .align 5 -.LCGEMM_L1x2_LOOP: +CGEMM_L1x2_LOOP: KERNEL1x2_1 KERNEL1x2_2 @@ -1181,9 +1183,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x2_2 addic. L, L, -1 - bgt .LCGEMM_L1x2_LOOP + bgt CGEMM_L1x2_LOOP -.LCGEMM_L1x2_LOOP_END: +CGEMM_L1x2_LOOP_END: KERNEL1x2_1 KERNEL1x2_2 @@ -1195,9 +1197,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x2_1 KERNEL1x2_E2 - b .LCGEMM_L1x2_SUB1 + b CGEMM_L1x2_SUB1 -.LCGEMM_L1x2_SUB4: +CGEMM_L1x2_SUB4: KERNEL1x2_SUBI1 KERNEL1x2_SUB1 @@ -1209,48 +1211,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x2_SUB1 KERNEL1x2_SUB1 - b .LCGEMM_L1x2_SUB1 + b CGEMM_L1x2_SUB1 -.LCGEMM_L1x2_SUB0: +CGEMM_L1x2_SUB0: andi. L, K, 7 KERNEL1x2_SUBI1 addic. L, L, -1 - ble .LCGEMM_L1x2_SAVE - b .LCGEMM_L1x2_SUB2 + ble CGEMM_L1x2_SAVE + b CGEMM_L1x2_SUB2 -.LCGEMM_L1x2_SUB1: +CGEMM_L1x2_SUB1: andi. L, K, 7 - ble .LCGEMM_L1x2_SAVE + ble CGEMM_L1x2_SAVE -.LCGEMM_L1x2_SUB2: +CGEMM_L1x2_SUB2: KERNEL1x2_SUB1 addic. L, L, -1 - bgt .LCGEMM_L1x2_SUB2 + bgt CGEMM_L1x2_SUB2 -.LCGEMM_L1x2_SAVE: +CGEMM_L1x2_SAVE: SAVE1x2 -.LCGEMM_L1x2_END: +CGEMM_L1x2_END: -.LCGEMM_L1x1_BEGIN: +CGEMM_L1x1_BEGIN: andi. T1, M, 1 - ble .LCGEMM_L1x1_END + ble CGEMM_L1x1_END mr BO, B srawi. 
L, K, 3 - ble .LCGEMM_L1x1_SUB0 + ble CGEMM_L1x1_SUB0 cmpwi cr0, L, 1 - ble .LCGEMM_L1x1_SUB4 + ble CGEMM_L1x1_SUB4 -.LCGEMM_L1x1_LOOP_START: +CGEMM_L1x1_LOOP_START: LOAD1x1_1 KERNEL1x1_I1 @@ -1264,11 +1266,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x1_2 addic. L, L, -2 - ble .LCGEMM_L1x1_LOOP_END + ble CGEMM_L1x1_LOOP_END .align 5 -.LCGEMM_L1x1_LOOP: +CGEMM_L1x1_LOOP: KERNEL1x1_1 KERNEL1x1_2 @@ -1281,9 +1283,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x1_2 addic. L, L, -1 - bgt .LCGEMM_L1x1_LOOP + bgt CGEMM_L1x1_LOOP -.LCGEMM_L1x1_LOOP_END: +CGEMM_L1x1_LOOP_END: KERNEL1x1_1 KERNEL1x1_2 @@ -1295,9 +1297,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x1_1 KERNEL1x1_E2 - b .LCGEMM_L1x1_SUB1 + b CGEMM_L1x1_SUB1 -.LCGEMM_L1x1_SUB4: +CGEMM_L1x1_SUB4: KERNEL1x1_SUBI1 KERNEL1x1_SUB1 @@ -1309,34 +1311,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x1_SUB1 KERNEL1x1_SUB1 - b .LCGEMM_L1x1_SUB1 + b CGEMM_L1x1_SUB1 -.LCGEMM_L1x1_SUB0: +CGEMM_L1x1_SUB0: andi. L, K, 7 KERNEL1x1_SUBI1 addic. L, L, -1 - ble .LCGEMM_L1x1_SAVE - b .LCGEMM_L1x1_SUB2 + ble CGEMM_L1x1_SAVE + b CGEMM_L1x1_SUB2 -.LCGEMM_L1x1_SUB1: +CGEMM_L1x1_SUB1: andi. L, K, 7 - ble .LCGEMM_L1x1_SAVE + ble CGEMM_L1x1_SAVE -.LCGEMM_L1x1_SUB2: +CGEMM_L1x1_SUB2: KERNEL1x1_SUB1 addic. L, L, -1 - bgt .LCGEMM_L1x1_SUB2 + bgt CGEMM_L1x1_SUB2 -.LCGEMM_L1x1_SAVE: +CGEMM_L1x1_SAVE: SAVE1x1 -.LCGEMM_L1x1_END: +CGEMM_L1x1_END: -.LCGEMM_L1_END: +CGEMM_L1_END: diff --git a/kernel/power/cgemm_macros_8x4_power8.S b/kernel/power/cgemm_macros_8x4_power8.S index 2085d3764..48a21252c 100644 --- a/kernel/power/cgemm_macros_8x4_power8.S +++ b/kernel/power/cgemm_macros_8x4_power8.S @@ -26,40 +26,56 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ /************************************************************************************** -* 2016/03/18 Werner Saar (wernsaar@googlemail.com) +* 2016/04/04 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK -* LAPACK-TEST : OK +* LAPACK-TEST : OK **************************************************************************************/ #if defined(NN) || defined(NT) || defined(TN) || defined(TT) - #define XSFADD_R1 xsaddsp - #define XSFADD_R2 xssubsp - #define XSFADD_I1 xsaddsp - #define XSFADD_I2 xsaddsp + #define XSFADD_R1 xsadddp + #define XSFADD_R2 xssubdp + #define XSFADD_I1 xsadddp + #define XSFADD_I2 xsadddp + #define XVFADD_R1 xvaddsp + #define XVFADD_R2 xvsubsp + #define XVFADD_I1 xvaddsp + #define XVFADD_I2 xvaddsp #elif defined(CN) || defined(CT) || defined(RN) || defined(RT) - #define XSFADD_R1 xsaddsp - #define XSFADD_R2 xsaddsp - #define XSFADD_I1 xssubsp - #define XSFADD_I2 xsaddsp + #define XSFADD_R1 xsadddp + #define XSFADD_R2 xsadddp + #define XSFADD_I1 xssubdp + #define XSFADD_I2 xsadddp + #define XVFADD_R1 xvaddsp + #define XVFADD_R2 xvaddsp + #define XVFADD_I1 xvsubsp + #define XVFADD_I2 xvaddsp #elif defined(NC) || defined(TC) || defined(NR) || defined(TR) - #define XSFADD_R1 xsaddsp - #define XSFADD_R2 xsaddsp - #define XSFADD_I1 xsaddsp - #define XSFADD_I2 xssubsp + #define XSFADD_R1 xsadddp + #define XSFADD_R2 xsadddp + #define XSFADD_I1 xsadddp + #define XSFADD_I2 xssubdp + #define XVFADD_R1 xvaddsp + #define XVFADD_R2 xvaddsp + #define XVFADD_I1 xvaddsp + #define XVFADD_I2 xvsubsp #else // CC || CR || RC || RR - #define XSFADD_R1 xsaddsp - #define XSFADD_R2 xssubsp - #define XSFADD_I1 xssubsp - #define XSFADD_I2 xssubsp + #define XSFADD_R1 xsadddp + #define XSFADD_R2 xssubdp + #define XSFADD_I1 xssubdp + #define XSFADD_I2 xssubdp + #define XVFADD_R1 xvaddsp + #define XVFADD_R2 xvsubsp + #define XVFADD_I1 xvsubsp + #define XVFADD_I2 
xvsubsp #endif @@ -172,24 +188,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x8_1 - xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r - xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i - lxvw4x vs24, o0, BO // load b0, b1 lxvw4x vs4, o0, AO // load a0, a1 - xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r - xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i - - lxvw4x vs25, o16, BO // load b2, b3 lxvw4x vs5, o16, AO // load a2, a3 - xvmaddasp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r - xvmaddasp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i - lxvw4x vs6, o32, AO // load a4, a5 + lxvw4x vs7, o48, AO // load a6, a7 + + addi AO, AO, 64 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs16, vs24, 0 + xxspltw vs17, vs24, 1 + xxspltw vs18, vs24, 2 + xxspltw vs19, vs24, 3 + + lxvw4x vs25, o16, BO // load b2, b3 + + xxspltw vs20, vs25, 0 + xxspltw vs21, vs25, 1 + xxspltw vs22, vs25, 2 + xxspltw vs23, vs25, 3 + + + addi BO, BO, 32 + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i @@ -211,47 +245,56 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmaddasp vs54, vs3, vs12 // a3_r*b2_r, a3_i*b2_r, a1_r*b2_r, a1_i*b2_r xvmaddasp vs55, vs3, vs13 // a3_r*b2_i, a3_i*b2_i, a1_r*b2_i, a1_i*b2_i - xxspltw vs16, vs24, 0 - xxspltw vs17, vs24, 1 - xxspltw vs18, vs24, 2 - xxspltw vs19, vs24, 3 - xvmaddasp vs56, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r xvmaddasp vs57, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i xvmaddasp vs58, vs1, vs14 // a1_r*b3_r, a1_i*b3_r, a1_r*b3_r, a1_i*b3_r xvmaddasp vs59, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i - addi BO, BO, 32 xvmaddasp vs60, vs2, vs14 // a2_r*b3_r, a2_i*b3_r, a1_r*b3_r, a1_i*b3_r xvmaddasp vs61, vs2, vs15 // a2_r*b3_i, a2_i*b3_i, a1_r*b3_i, a1_i*b3_i - addi AO, AO, 64 xvmaddasp vs62, vs3, vs14 // a3_r*b3_r, a3_i*b3_r, a1_r*b3_r, a1_i*b3_r xvmaddasp vs63, vs3, vs15 // a3_r*b3_i, a3_i*b3_i, a1_r*b3_i, a1_i*b3_i - xxspltw vs20, vs25, 0 - xxspltw vs21, vs25, 1 - xxspltw vs22, vs25, 2 - xxspltw vs23, vs25, 3 .endm .macro KERNEL4x8_2 - xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r - xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i - lxvw4x vs24, o0, BO // load b0, b1 lxvw4x vs0, o0, AO // load a0, a1 - xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r - xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i + lxvw4x vs1, o16, AO // load a2, a3 + + lxvw4x vs2, o32, AO // load a4, a5 + + lxvw4x vs3, o48, AO // load a6, a7 + + + addi AO, AO, 64 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 lxvw4x vs25, o16, BO // load b2, b3 - lxvw4x vs1, o16, AO // load a2, a3 + xxspltw vs12, vs25, 0 + xxspltw vs13, vs25, 1 + xxspltw vs14, vs25, 2 + xxspltw vs15, vs25, 3 + + + addi BO, BO, 32 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, 
a5_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i xvmaddasp vs36, vs6, vs16 // a6_r*b0_r, a6_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs37, vs6, vs17 // a6_r*b0_i, a6_i*b0_i, a1_r*b0_i, a1_i*b0_i - lxvw4x vs2, o32, AO // load a4, a5 - lxvw4x vs3, o48, AO // load a6, a7 xvmaddasp vs38, vs7, vs16 // a7_r*b0_r, a7_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs39, vs7, vs17 // a7_r*b0_i, a7_i*b0_i, a1_r*b0_i, a1_i*b0_i @@ -273,26 +316,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs54, vs7, vs20 // a7_r*b2_r, a7_i*b2_r, a1_r*b2_r, a1_i*b2_r xvmaddasp vs55, vs7, vs21 // a7_r*b2_i, a7_i*b2_i, a1_r*b2_i, a1_i*b2_i - xxspltw vs8, vs24, 0 - xxspltw vs9, vs24, 1 - xxspltw vs10, vs24, 2 - xxspltw vs11, vs24, 3 - xvmaddasp vs56, vs4, vs22 // a4_r*b3_r, a4_i*b3_r, a1_r*b3_r, a1_i*b3_r xvmaddasp vs57, vs4, vs23 // a4_r*b3_i, a4_i*b3_i, a1_r*b3_i, a1_i*b3_i xvmaddasp vs58, vs5, vs22 // a5_r*b3_r, a5_i*b3_r, a1_r*b3_r, a1_i*b3_r xvmaddasp vs59, vs5, vs23 // a5_r*b3_i, a5_i*b3_i, a1_r*b3_i, a1_i*b3_i - addi AO, AO, 64 xvmaddasp vs60, vs6, vs22 // a6_r*b3_r, a6_i*b3_r, a1_r*b3_r, a1_i*b3_r xvmaddasp vs61, vs6, vs23 // a6_r*b3_i, a6_i*b3_i, a1_r*b3_i, a1_i*b3_i - addi BO, BO, 32 xvmaddasp vs62, vs7, vs22 // a7_r*b3_r, a7_i*b3_r, a1_r*b3_r, a1_i*b3_r xvmaddasp vs63, vs7, vs23 // a7_r*b3_i, a7_i*b3_i, a1_r*b3_i, a1_i*b3_i - xxspltw vs12, vs25, 0 - xxspltw vs13, vs25, 1 - xxspltw vs14, vs25, 2 - xxspltw vs15, vs25, 3 .endm @@ -501,51 +533,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 #endif - stxvw4x vs32, o0, TBUFFER - lxsspx vs8, o0, TBUFFER - lxsspx vs9, o4, TBUFFER - lxsspx vs10, o8, TBUFFER - lxsspx vs11, o12, TBUFFER + xxspltw vs8, vs32, 0 + xxspltw vs9, vs32, 1 + xxspltw vs10, vs32, 2 + xxspltw vs11, vs32, 3 - stxvw4x vs33, o0, TBUFFER - lxsspx vs12, o0, TBUFFER - lxsspx vs13, o4, TBUFFER - lxsspx vs14, o8, TBUFFER - lxsspx vs15, o12, TBUFFER + xxspltw vs12, vs33, 0 + xxspltw vs13, vs33, 1 + xxspltw vs14, vs33, 2 + xxspltw vs15, vs33, 3 - XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r - XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i - XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r - XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i - XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i - XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r - XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i - XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r - xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i - xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i - 
xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r - xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i - xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - stxsspx vs20, o0, TBUFFER // store r0_r - stxsspx vs21, o4, TBUFFER // store r0_i - stxsspx vs22, o8, TBUFFER // store r1_r - stxsspx vs23, o12, TBUFFER // store r1_i - lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 @@ -566,51 +599,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 #endif - stxvw4x vs34, o0, TBUFFER - lxsspx vs8, o0, TBUFFER - lxsspx vs9, o4, TBUFFER - lxsspx vs10, o8, TBUFFER - lxsspx vs11, o12, TBUFFER + xxspltw vs8, vs34, 0 + xxspltw vs9, vs34, 1 + xxspltw vs10, vs34, 2 + xxspltw vs11, vs34, 3 - stxvw4x vs35, o0, TBUFFER - lxsspx vs12, o0, TBUFFER - lxsspx vs13, o4, TBUFFER - lxsspx vs14, o8, TBUFFER - lxsspx vs15, o12, TBUFFER + xxspltw vs12, vs35, 0 + xxspltw vs13, vs35, 1 + xxspltw vs14, vs35, 2 + xxspltw vs15, vs35, 3 - XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r - XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i - XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r - XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i - XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i - XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r - XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i - XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r - xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i - xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i - 
xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r - xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i - xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - stxsspx vs20, o0, TBUFFER // store r0_r - stxsspx vs21, o4, TBUFFER // store r0_i - stxsspx vs22, o8, TBUFFER // store r1_r - stxsspx vs23, o12, TBUFFER // store r1_i - lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 @@ -631,51 +665,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 #endif - stxvw4x vs36, o0, TBUFFER - lxsspx vs8, o0, TBUFFER - lxsspx vs9, o4, TBUFFER - lxsspx vs10, o8, TBUFFER - lxsspx vs11, o12, TBUFFER + xxspltw vs8, vs36, 0 + xxspltw vs9, vs36, 1 + xxspltw vs10, vs36, 2 + xxspltw vs11, vs36, 3 - stxvw4x vs37, o0, TBUFFER - lxsspx vs12, o0, TBUFFER - lxsspx vs13, o4, TBUFFER - lxsspx vs14, o8, TBUFFER - lxsspx vs15, o12, TBUFFER + xxspltw vs12, vs37, 0 + xxspltw vs13, vs37, 1 + xxspltw vs14, vs37, 2 + xxspltw vs15, vs37, 3 - XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r - XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i - XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r - XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i - XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i - XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r - XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i - XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r - xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i - xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i - 
xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r - xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i - xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - stxsspx vs20, o0, TBUFFER // store r0_r - stxsspx vs21, o4, TBUFFER // store r0_i - stxsspx vs22, o8, TBUFFER // store r1_r - stxsspx vs23, o12, TBUFFER // store r1_i - lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 @@ -696,51 +731,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 #endif - stxvw4x vs38, o0, TBUFFER - lxsspx vs8, o0, TBUFFER - lxsspx vs9, o4, TBUFFER - lxsspx vs10, o8, TBUFFER - lxsspx vs11, o12, TBUFFER + xxspltw vs8, vs38, 0 + xxspltw vs9, vs38, 1 + xxspltw vs10, vs38, 2 + xxspltw vs11, vs38, 3 - stxvw4x vs39, o0, TBUFFER - lxsspx vs12, o0, TBUFFER - lxsspx vs13, o4, TBUFFER - lxsspx vs14, o8, TBUFFER - lxsspx vs15, o12, TBUFFER + xxspltw vs12, vs39, 0 + xxspltw vs13, vs39, 1 + xxspltw vs14, vs39, 2 + xxspltw vs15, vs39, 3 - XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r - XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i - XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r - XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i - XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i - XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r - XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i - XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r - xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i - xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i - 
xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r - xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i - xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - stxsspx vs20, o0, TBUFFER // store r0_r - stxsspx vs21, o4, TBUFFER // store r0_i - stxsspx vs22, o8, TBUFFER // store r1_r - stxsspx vs23, o12, TBUFFER // store r1_i - lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 @@ -767,51 +803,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 #endif - stxvw4x vs40, o0, TBUFFER - lxsspx vs8, o0, TBUFFER - lxsspx vs9, o4, TBUFFER - lxsspx vs10, o8, TBUFFER - lxsspx vs11, o12, TBUFFER + xxspltw vs8, vs40, 0 + xxspltw vs9, vs40, 1 + xxspltw vs10, vs40, 2 + xxspltw vs11, vs40, 3 - stxvw4x vs41, o0, TBUFFER - lxsspx vs12, o0, TBUFFER - lxsspx vs13, o4, TBUFFER - lxsspx vs14, o8, TBUFFER - lxsspx vs15, o12, TBUFFER + xxspltw vs12, vs41, 0 + xxspltw vs13, vs41, 1 + xxspltw vs14, vs41, 2 + xxspltw vs15, vs41, 3 - XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r - XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i - XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r - XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i - XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i - XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r - XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i - XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r - xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i - xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i - 
xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r - xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i - xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - stxsspx vs20, o0, TBUFFER // store r0_r - stxsspx vs21, o4, TBUFFER // store r0_i - stxsspx vs22, o8, TBUFFER // store r1_r - stxsspx vs23, o12, TBUFFER // store r1_i - lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 @@ -832,51 +869,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 #endif - stxvw4x vs42, o0, TBUFFER - lxsspx vs8, o0, TBUFFER - lxsspx vs9, o4, TBUFFER - lxsspx vs10, o8, TBUFFER - lxsspx vs11, o12, TBUFFER + xxspltw vs8, vs42, 0 + xxspltw vs9, vs42, 1 + xxspltw vs10, vs42, 2 + xxspltw vs11, vs42, 3 - stxvw4x vs43, o0, TBUFFER - lxsspx vs12, o0, TBUFFER - lxsspx vs13, o4, TBUFFER - lxsspx vs14, o8, TBUFFER - lxsspx vs15, o12, TBUFFER + xxspltw vs12, vs43, 0 + xxspltw vs13, vs43, 1 + xxspltw vs14, vs43, 2 + xxspltw vs15, vs43, 3 - XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r - XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i - XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r - XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i - XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i - XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r - XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i - XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r - xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i - xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i - 
xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r - xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i - xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - stxsspx vs20, o0, TBUFFER // store r0_r - stxsspx vs21, o4, TBUFFER // store r0_i - stxsspx vs22, o8, TBUFFER // store r1_r - stxsspx vs23, o12, TBUFFER // store r1_i - lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 @@ -897,51 +935,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 #endif - stxvw4x vs44, o0, TBUFFER - lxsspx vs8, o0, TBUFFER - lxsspx vs9, o4, TBUFFER - lxsspx vs10, o8, TBUFFER - lxsspx vs11, o12, TBUFFER + xxspltw vs8, vs44, 0 + xxspltw vs9, vs44, 1 + xxspltw vs10, vs44, 2 + xxspltw vs11, vs44, 3 - stxvw4x vs45, o0, TBUFFER - lxsspx vs12, o0, TBUFFER - lxsspx vs13, o4, TBUFFER - lxsspx vs14, o8, TBUFFER - lxsspx vs15, o12, TBUFFER + xxspltw vs12, vs45, 0 + xxspltw vs13, vs45, 1 + xxspltw vs14, vs45, 2 + xxspltw vs15, vs45, 3 - XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r - XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i - XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r - XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i - XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i - XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r - XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i - XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r - xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i - xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i - 
xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r - xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i - xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - stxsspx vs20, o0, TBUFFER // store r0_r - stxsspx vs21, o4, TBUFFER // store r0_i - stxsspx vs22, o8, TBUFFER // store r1_r - stxsspx vs23, o12, TBUFFER // store r1_i - lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 @@ -962,51 +1001,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 #endif - stxvw4x vs46, o0, TBUFFER - lxsspx vs8, o0, TBUFFER - lxsspx vs9, o4, TBUFFER - lxsspx vs10, o8, TBUFFER - lxsspx vs11, o12, TBUFFER + xxspltw vs8, vs46, 0 + xxspltw vs9, vs46, 1 + xxspltw vs10, vs46, 2 + xxspltw vs11, vs46, 3 - stxvw4x vs47, o0, TBUFFER - lxsspx vs12, o0, TBUFFER - lxsspx vs13, o4, TBUFFER - lxsspx vs14, o8, TBUFFER - lxsspx vs15, o12, TBUFFER + xxspltw vs12, vs47, 0 + xxspltw vs13, vs47, 1 + xxspltw vs14, vs47, 2 + xxspltw vs15, vs47, 3 - XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r - XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i - XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r - XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i - XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i - XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r - XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i - XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r - xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i - xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i - 
xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r - xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i - xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - stxsspx vs20, o0, TBUFFER // store r0_r - stxsspx vs21, o4, TBUFFER // store r0_i - stxsspx vs22, o8, TBUFFER // store r1_r - stxsspx vs23, o12, TBUFFER // store r1_i - lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 @@ -1033,51 +1073,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 #endif - stxvw4x vs48, o0, TBUFFER - lxsspx vs8, o0, TBUFFER - lxsspx vs9, o4, TBUFFER - lxsspx vs10, o8, TBUFFER - lxsspx vs11, o12, TBUFFER + xxspltw vs8, vs48, 0 + xxspltw vs9, vs48, 1 + xxspltw vs10, vs48, 2 + xxspltw vs11, vs48, 3 - stxvw4x vs49, o0, TBUFFER - lxsspx vs12, o0, TBUFFER - lxsspx vs13, o4, TBUFFER - lxsspx vs14, o8, TBUFFER - lxsspx vs15, o12, TBUFFER + xxspltw vs12, vs49, 0 + xxspltw vs13, vs49, 1 + xxspltw vs14, vs49, 2 + xxspltw vs15, vs49, 3 - XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r - XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i - XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r - XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i - XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i - XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r - XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i - XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r - xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i - xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i - 
xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r - xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i - xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - stxsspx vs20, o0, TBUFFER // store r0_r - stxsspx vs21, o4, TBUFFER // store r0_i - stxsspx vs22, o8, TBUFFER // store r1_r - stxsspx vs23, o12, TBUFFER // store r1_i - lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 @@ -1098,51 +1139,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 #endif - stxvw4x vs50, o0, TBUFFER - lxsspx vs8, o0, TBUFFER - lxsspx vs9, o4, TBUFFER - lxsspx vs10, o8, TBUFFER - lxsspx vs11, o12, TBUFFER + xxspltw vs8, vs50, 0 + xxspltw vs9, vs50, 1 + xxspltw vs10, vs50, 2 + xxspltw vs11, vs50, 3 - stxvw4x vs51, o0, TBUFFER - lxsspx vs12, o0, TBUFFER - lxsspx vs13, o4, TBUFFER - lxsspx vs14, o8, TBUFFER - lxsspx vs15, o12, TBUFFER + xxspltw vs12, vs51, 0 + xxspltw vs13, vs51, 1 + xxspltw vs14, vs51, 2 + xxspltw vs15, vs51, 3 - XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r - XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i - XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r - XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i - XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i - XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r - XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i - XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r - xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i - xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i - 
xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r - xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i - xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - stxsspx vs20, o0, TBUFFER // store r0_r - stxsspx vs21, o4, TBUFFER // store r0_i - stxsspx vs22, o8, TBUFFER // store r1_r - stxsspx vs23, o12, TBUFFER // store r1_i - lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 @@ -1163,51 +1205,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 #endif - stxvw4x vs52, o0, TBUFFER - lxsspx vs8, o0, TBUFFER - lxsspx vs9, o4, TBUFFER - lxsspx vs10, o8, TBUFFER - lxsspx vs11, o12, TBUFFER + xxspltw vs8, vs52, 0 + xxspltw vs9, vs52, 1 + xxspltw vs10, vs52, 2 + xxspltw vs11, vs52, 3 - stxvw4x vs53, o0, TBUFFER - lxsspx vs12, o0, TBUFFER - lxsspx vs13, o4, TBUFFER - lxsspx vs14, o8, TBUFFER - lxsspx vs15, o12, TBUFFER + xxspltw vs12, vs53, 0 + xxspltw vs13, vs53, 1 + xxspltw vs14, vs53, 2 + xxspltw vs15, vs53, 3 - XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r - XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i - XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r - XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i - XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i - XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r - XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i - XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r - xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i - xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i - 
xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r - xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i - xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - stxsspx vs20, o0, TBUFFER // store r0_r - stxsspx vs21, o4, TBUFFER // store r0_i - stxsspx vs22, o8, TBUFFER // store r1_r - stxsspx vs23, o12, TBUFFER // store r1_i - lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 @@ -1228,51 +1271,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 #endif - stxvw4x vs54, o0, TBUFFER - lxsspx vs8, o0, TBUFFER - lxsspx vs9, o4, TBUFFER - lxsspx vs10, o8, TBUFFER - lxsspx vs11, o12, TBUFFER + xxspltw vs8, vs54, 0 + xxspltw vs9, vs54, 1 + xxspltw vs10, vs54, 2 + xxspltw vs11, vs54, 3 - stxvw4x vs55, o0, TBUFFER - lxsspx vs12, o0, TBUFFER - lxsspx vs13, o4, TBUFFER - lxsspx vs14, o8, TBUFFER - lxsspx vs15, o12, TBUFFER + xxspltw vs12, vs55, 0 + xxspltw vs13, vs55, 1 + xxspltw vs14, vs55, 2 + xxspltw vs15, vs55, 3 - XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r - XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i - XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r - XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i - XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i - XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r - XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i - XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r - xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i - xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i - 
xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r - xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i - xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - stxsspx vs20, o0, TBUFFER // store r0_r - stxsspx vs21, o4, TBUFFER // store r0_i - stxsspx vs22, o8, TBUFFER // store r1_r - stxsspx vs23, o12, TBUFFER // store r1_i - lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 @@ -1299,51 +1343,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 #endif - stxvw4x vs56, o0, TBUFFER - lxsspx vs8, o0, TBUFFER - lxsspx vs9, o4, TBUFFER - lxsspx vs10, o8, TBUFFER - lxsspx vs11, o12, TBUFFER + xxspltw vs8, vs56, 0 + xxspltw vs9, vs56, 1 + xxspltw vs10, vs56, 2 + xxspltw vs11, vs56, 3 - stxvw4x vs57, o0, TBUFFER - lxsspx vs12, o0, TBUFFER - lxsspx vs13, o4, TBUFFER - lxsspx vs14, o8, TBUFFER - lxsspx vs15, o12, TBUFFER + xxspltw vs12, vs57, 0 + xxspltw vs13, vs57, 1 + xxspltw vs14, vs57, 2 + xxspltw vs15, vs57, 3 - XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r - XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i - XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r - XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i - XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i - XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r - XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i - XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r - xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i - xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i - 
xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r - xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i - xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - stxsspx vs20, o0, TBUFFER // store r0_r - stxsspx vs21, o4, TBUFFER // store r0_i - stxsspx vs22, o8, TBUFFER // store r1_r - stxsspx vs23, o12, TBUFFER // store r1_i - lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 @@ -1364,51 +1409,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 #endif - stxvw4x vs58, o0, TBUFFER - lxsspx vs8, o0, TBUFFER - lxsspx vs9, o4, TBUFFER - lxsspx vs10, o8, TBUFFER - lxsspx vs11, o12, TBUFFER + xxspltw vs8, vs58, 0 + xxspltw vs9, vs58, 1 + xxspltw vs10, vs58, 2 + xxspltw vs11, vs58, 3 - stxvw4x vs59, o0, TBUFFER - lxsspx vs12, o0, TBUFFER - lxsspx vs13, o4, TBUFFER - lxsspx vs14, o8, TBUFFER - lxsspx vs15, o12, TBUFFER + xxspltw vs12, vs59, 0 + xxspltw vs13, vs59, 1 + xxspltw vs14, vs59, 2 + xxspltw vs15, vs59, 3 - XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r - XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i - XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r - XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i - XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i - XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r - XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i - XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r - xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i - xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i - 
xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r - xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i - xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - stxsspx vs20, o0, TBUFFER // store r0_r - stxsspx vs21, o4, TBUFFER // store r0_i - stxsspx vs22, o8, TBUFFER // store r1_r - stxsspx vs23, o12, TBUFFER // store r1_i - lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 @@ -1429,51 +1475,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 #endif - stxvw4x vs60, o0, TBUFFER - lxsspx vs8, o0, TBUFFER - lxsspx vs9, o4, TBUFFER - lxsspx vs10, o8, TBUFFER - lxsspx vs11, o12, TBUFFER + xxspltw vs8, vs60, 0 + xxspltw vs9, vs60, 1 + xxspltw vs10, vs60, 2 + xxspltw vs11, vs60, 3 - stxvw4x vs61, o0, TBUFFER - lxsspx vs12, o0, TBUFFER - lxsspx vs13, o4, TBUFFER - lxsspx vs14, o8, TBUFFER - lxsspx vs15, o12, TBUFFER + xxspltw vs12, vs61, 0 + xxspltw vs13, vs61, 1 + xxspltw vs14, vs61, 2 + xxspltw vs15, vs61, 3 - XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r - XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i - XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r - XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i - XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i - XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r - XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i - XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r - xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i - xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i - 
xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r - xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i - xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - stxsspx vs20, o0, TBUFFER // store r0_r - stxsspx vs21, o4, TBUFFER // store r0_i - stxsspx vs22, o8, TBUFFER // store r1_r - stxsspx vs23, o12, TBUFFER // store r1_i - lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 @@ -1494,51 +1541,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 #endif - stxvw4x vs62, o0, TBUFFER - lxsspx vs8, o0, TBUFFER - lxsspx vs9, o4, TBUFFER - lxsspx vs10, o8, TBUFFER - lxsspx vs11, o12, TBUFFER + xxspltw vs8, vs62, 0 + xxspltw vs9, vs62, 1 + xxspltw vs10, vs62, 2 + xxspltw vs11, vs62, 3 - stxvw4x vs63, o0, TBUFFER - lxsspx vs12, o0, TBUFFER - lxsspx vs13, o4, TBUFFER - lxsspx vs14, o8, TBUFFER - lxsspx vs15, o12, TBUFFER + xxspltw vs12, vs63, 0 + xxspltw vs13, vs63, 1 + xxspltw vs14, vs63, 2 + xxspltw vs15, vs63, 3 - XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r - XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i - XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r - XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i - XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i - XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r - XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i - XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r - xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i - xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i - 
xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r - xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i - xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - stxsspx vs20, o0, TBUFFER // store r0_r - stxsspx vs21, o4, TBUFFER // store r0_i - stxsspx vs22, o8, TBUFFER // store r1_r - stxsspx vs23, o12, TBUFFER // store r1_i - lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 @@ -1886,51 +1934,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 #endif - stxvw4x vs32, o0, TBUFFER - lxsspx vs8, o0, TBUFFER - lxsspx vs9, o4, TBUFFER - lxsspx vs10, o8, TBUFFER - lxsspx vs11, o12, TBUFFER + xxspltw vs8, vs32, 0 + xxspltw vs9, vs32, 1 + xxspltw vs10, vs32, 2 + xxspltw vs11, vs32, 3 - stxvw4x vs33, o0, TBUFFER - lxsspx vs12, o0, TBUFFER - lxsspx vs13, o4, TBUFFER - lxsspx vs14, o8, TBUFFER - lxsspx vs15, o12, TBUFFER + xxspltw vs12, vs33, 0 + xxspltw vs13, vs33, 1 + xxspltw vs14, vs33, 2 + xxspltw vs15, vs33, 3 - XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r - XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i - XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r - XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i - XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i - XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r - XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i - XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r - xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i - xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i - 
xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r - xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i - xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - stxsspx vs20, o0, TBUFFER // store r0_r - stxsspx vs21, o4, TBUFFER // store r0_i - stxsspx vs22, o8, TBUFFER // store r1_r - stxsspx vs23, o12, TBUFFER // store r1_i - lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 @@ -1951,51 +2000,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 #endif - stxvw4x vs34, o0, TBUFFER - lxsspx vs8, o0, TBUFFER - lxsspx vs9, o4, TBUFFER - lxsspx vs10, o8, TBUFFER - lxsspx vs11, o12, TBUFFER + xxspltw vs8, vs34, 0 + xxspltw vs9, vs34, 1 + xxspltw vs10, vs34, 2 + xxspltw vs11, vs34, 3 - stxvw4x vs35, o0, TBUFFER - lxsspx vs12, o0, TBUFFER - lxsspx vs13, o4, TBUFFER - lxsspx vs14, o8, TBUFFER - lxsspx vs15, o12, TBUFFER + xxspltw vs12, vs35, 0 + xxspltw vs13, vs35, 1 + xxspltw vs14, vs35, 2 + xxspltw vs15, vs35, 3 - XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r - XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i - XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r - XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i - XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i - XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r - XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i - XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r - xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i - xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i - 
xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r - xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i - xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - stxsspx vs20, o0, TBUFFER // store r0_r - stxsspx vs21, o4, TBUFFER // store r0_i - stxsspx vs22, o8, TBUFFER // store r1_r - stxsspx vs23, o12, TBUFFER // store r1_i - lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 @@ -2022,51 +2072,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 #endif - stxvw4x vs36, o0, TBUFFER - lxsspx vs8, o0, TBUFFER - lxsspx vs9, o4, TBUFFER - lxsspx vs10, o8, TBUFFER - lxsspx vs11, o12, TBUFFER + xxspltw vs8, vs36, 0 + xxspltw vs9, vs36, 1 + xxspltw vs10, vs36, 2 + xxspltw vs11, vs36, 3 - stxvw4x vs37, o0, TBUFFER - lxsspx vs12, o0, TBUFFER - lxsspx vs13, o4, TBUFFER - lxsspx vs14, o8, TBUFFER - lxsspx vs15, o12, TBUFFER + xxspltw vs12, vs37, 0 + xxspltw vs13, vs37, 1 + xxspltw vs14, vs37, 2 + xxspltw vs15, vs37, 3 - XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r - XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i - XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r - XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i - XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i - XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r - XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i - XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r - xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i - xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i - 
xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r - xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i - xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - stxsspx vs20, o0, TBUFFER // store r0_r - stxsspx vs21, o4, TBUFFER // store r0_i - stxsspx vs22, o8, TBUFFER // store r1_r - stxsspx vs23, o12, TBUFFER // store r1_i - lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 @@ -2087,51 +2138,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 #endif - stxvw4x vs38, o0, TBUFFER - lxsspx vs8, o0, TBUFFER - lxsspx vs9, o4, TBUFFER - lxsspx vs10, o8, TBUFFER - lxsspx vs11, o12, TBUFFER + xxspltw vs8, vs38, 0 + xxspltw vs9, vs38, 1 + xxspltw vs10, vs38, 2 + xxspltw vs11, vs38, 3 - stxvw4x vs39, o0, TBUFFER - lxsspx vs12, o0, TBUFFER - lxsspx vs13, o4, TBUFFER - lxsspx vs14, o8, TBUFFER - lxsspx vs15, o12, TBUFFER + xxspltw vs12, vs39, 0 + xxspltw vs13, vs39, 1 + xxspltw vs14, vs39, 2 + xxspltw vs15, vs39, 3 - XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r - XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i - XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r - XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i - XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i - XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r - XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i - XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r - xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i - xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i - 
xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r - xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i - xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - stxsspx vs20, o0, TBUFFER // store r0_r - stxsspx vs21, o4, TBUFFER // store r0_i - stxsspx vs22, o8, TBUFFER // store r1_r - stxsspx vs23, o12, TBUFFER // store r1_i - lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 @@ -2158,51 +2210,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 #endif - stxvw4x vs40, o0, TBUFFER - lxsspx vs8, o0, TBUFFER - lxsspx vs9, o4, TBUFFER - lxsspx vs10, o8, TBUFFER - lxsspx vs11, o12, TBUFFER + xxspltw vs8, vs40, 0 + xxspltw vs9, vs40, 1 + xxspltw vs10, vs40, 2 + xxspltw vs11, vs40, 3 - stxvw4x vs41, o0, TBUFFER - lxsspx vs12, o0, TBUFFER - lxsspx vs13, o4, TBUFFER - lxsspx vs14, o8, TBUFFER - lxsspx vs15, o12, TBUFFER + xxspltw vs12, vs41, 0 + xxspltw vs13, vs41, 1 + xxspltw vs14, vs41, 2 + xxspltw vs15, vs41, 3 - XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r - XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i - XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r - XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i - XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i - XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r - XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i - XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r - xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i - xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i - 
xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r - xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i - xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - stxsspx vs20, o0, TBUFFER // store r0_r - stxsspx vs21, o4, TBUFFER // store r0_i - stxsspx vs22, o8, TBUFFER // store r1_r - stxsspx vs23, o12, TBUFFER // store r1_i - lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 @@ -2223,51 +2276,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 #endif - stxvw4x vs42, o0, TBUFFER - lxsspx vs8, o0, TBUFFER - lxsspx vs9, o4, TBUFFER - lxsspx vs10, o8, TBUFFER - lxsspx vs11, o12, TBUFFER + xxspltw vs8, vs42, 0 + xxspltw vs9, vs42, 1 + xxspltw vs10, vs42, 2 + xxspltw vs11, vs42, 3 - stxvw4x vs43, o0, TBUFFER - lxsspx vs12, o0, TBUFFER - lxsspx vs13, o4, TBUFFER - lxsspx vs14, o8, TBUFFER - lxsspx vs15, o12, TBUFFER + xxspltw vs12, vs43, 0 + xxspltw vs13, vs43, 1 + xxspltw vs14, vs43, 2 + xxspltw vs15, vs43, 3 - XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r - XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i - XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r - XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i - XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i - XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r - XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i - XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r - xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i - xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i - 
xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r - xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i - xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - stxsspx vs20, o0, TBUFFER // store r0_r - stxsspx vs21, o4, TBUFFER // store r0_i - stxsspx vs22, o8, TBUFFER // store r1_r - stxsspx vs23, o12, TBUFFER // store r1_i - lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 @@ -2294,51 +2348,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 #endif - stxvw4x vs44, o0, TBUFFER - lxsspx vs8, o0, TBUFFER - lxsspx vs9, o4, TBUFFER - lxsspx vs10, o8, TBUFFER - lxsspx vs11, o12, TBUFFER + xxspltw vs8, vs44, 0 + xxspltw vs9, vs44, 1 + xxspltw vs10, vs44, 2 + xxspltw vs11, vs44, 3 - stxvw4x vs45, o0, TBUFFER - lxsspx vs12, o0, TBUFFER - lxsspx vs13, o4, TBUFFER - lxsspx vs14, o8, TBUFFER - lxsspx vs15, o12, TBUFFER + xxspltw vs12, vs45, 0 + xxspltw vs13, vs45, 1 + xxspltw vs14, vs45, 2 + xxspltw vs15, vs45, 3 - XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r - XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i - XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r - XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i - XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i - XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r - XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i - XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r - xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i - xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i - 
xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r - xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i - xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - stxsspx vs20, o0, TBUFFER // store r0_r - stxsspx vs21, o4, TBUFFER // store r0_i - stxsspx vs22, o8, TBUFFER // store r1_r - stxsspx vs23, o12, TBUFFER // store r1_i - lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 @@ -2359,51 +2414,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 #endif - stxvw4x vs46, o0, TBUFFER - lxsspx vs8, o0, TBUFFER - lxsspx vs9, o4, TBUFFER - lxsspx vs10, o8, TBUFFER - lxsspx vs11, o12, TBUFFER + xxspltw vs8, vs46, 0 + xxspltw vs9, vs46, 1 + xxspltw vs10, vs46, 2 + xxspltw vs11, vs46, 3 - stxvw4x vs47, o0, TBUFFER - lxsspx vs12, o0, TBUFFER - lxsspx vs13, o4, TBUFFER - lxsspx vs14, o8, TBUFFER - lxsspx vs15, o12, TBUFFER + xxspltw vs12, vs47, 0 + xxspltw vs13, vs47, 1 + xxspltw vs14, vs47, 2 + xxspltw vs15, vs47, 3 - XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r - XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i - XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r - XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i - XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i - XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r - XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i - XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r - xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i - xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i - 
xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r - xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i - xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - stxsspx vs20, o0, TBUFFER // store r0_r - stxsspx vs21, o4, TBUFFER // store r0_i - stxsspx vs22, o8, TBUFFER // store r1_r - stxsspx vs23, o12, TBUFFER // store r1_i - lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 @@ -2691,51 +2747,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 #endif - stxvw4x vs32, o0, TBUFFER - lxsspx vs8, o0, TBUFFER - lxsspx vs9, o4, TBUFFER - lxsspx vs10, o8, TBUFFER - lxsspx vs11, o12, TBUFFER + xxspltw vs8, vs32, 0 + xxspltw vs9, vs32, 1 + xxspltw vs10, vs32, 2 + xxspltw vs11, vs32, 3 - stxvw4x vs33, o0, TBUFFER - lxsspx vs12, o0, TBUFFER - lxsspx vs13, o4, TBUFFER - lxsspx vs14, o8, TBUFFER - lxsspx vs15, o12, TBUFFER + xxspltw vs12, vs33, 0 + xxspltw vs13, vs33, 1 + xxspltw vs14, vs33, 2 + xxspltw vs15, vs33, 3 - XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r - XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i - XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r - XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i - XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i - XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r - XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i - XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r - xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i - xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i - 
xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r - xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i - xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - stxsspx vs20, o0, TBUFFER // store r0_r - stxsspx vs21, o4, TBUFFER // store r0_i - stxsspx vs22, o8, TBUFFER // store r1_r - stxsspx vs23, o12, TBUFFER // store r1_i - lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 @@ -2762,51 +2819,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 #endif - stxvw4x vs34, o0, TBUFFER - lxsspx vs8, o0, TBUFFER - lxsspx vs9, o4, TBUFFER - lxsspx vs10, o8, TBUFFER - lxsspx vs11, o12, TBUFFER + xxspltw vs8, vs34, 0 + xxspltw vs9, vs34, 1 + xxspltw vs10, vs34, 2 + xxspltw vs11, vs34, 3 - stxvw4x vs35, o0, TBUFFER - lxsspx vs12, o0, TBUFFER - lxsspx vs13, o4, TBUFFER - lxsspx vs14, o8, TBUFFER - lxsspx vs15, o12, TBUFFER + xxspltw vs12, vs35, 0 + xxspltw vs13, vs35, 1 + xxspltw vs14, vs35, 2 + xxspltw vs15, vs35, 3 - XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r - XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i - XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r - XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i - XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i - XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r - XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i - XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r - xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i - xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i - 
xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r - xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i - xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - stxsspx vs20, o0, TBUFFER // store r0_r - stxsspx vs21, o4, TBUFFER // store r0_i - stxsspx vs22, o8, TBUFFER // store r1_r - stxsspx vs23, o12, TBUFFER // store r1_i - lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 @@ -2833,51 +2891,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 #endif - stxvw4x vs36, o0, TBUFFER - lxsspx vs8, o0, TBUFFER - lxsspx vs9, o4, TBUFFER - lxsspx vs10, o8, TBUFFER - lxsspx vs11, o12, TBUFFER + xxspltw vs8, vs36, 0 + xxspltw vs9, vs36, 1 + xxspltw vs10, vs36, 2 + xxspltw vs11, vs36, 3 - stxvw4x vs37, o0, TBUFFER - lxsspx vs12, o0, TBUFFER - lxsspx vs13, o4, TBUFFER - lxsspx vs14, o8, TBUFFER - lxsspx vs15, o12, TBUFFER + xxspltw vs12, vs37, 0 + xxspltw vs13, vs37, 1 + xxspltw vs14, vs37, 2 + xxspltw vs15, vs37, 3 - XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r - XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i - XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r - XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i - XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i - XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r - XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i - XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r - xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i - xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i - 
xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r - xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i - xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - stxsspx vs20, o0, TBUFFER // store r0_r - stxsspx vs21, o4, TBUFFER // store r0_i - stxsspx vs22, o8, TBUFFER // store r1_r - stxsspx vs23, o12, TBUFFER // store r1_i - lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 @@ -2904,51 +2963,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 #endif - stxvw4x vs38, o0, TBUFFER - lxsspx vs8, o0, TBUFFER - lxsspx vs9, o4, TBUFFER - lxsspx vs10, o8, TBUFFER - lxsspx vs11, o12, TBUFFER + xxspltw vs8, vs38, 0 + xxspltw vs9, vs38, 1 + xxspltw vs10, vs38, 2 + xxspltw vs11, vs38, 3 - stxvw4x vs39, o0, TBUFFER - lxsspx vs12, o0, TBUFFER - lxsspx vs13, o4, TBUFFER - lxsspx vs14, o8, TBUFFER - lxsspx vs15, o12, TBUFFER + xxspltw vs12, vs39, 0 + xxspltw vs13, vs39, 1 + xxspltw vs14, vs39, 2 + xxspltw vs15, vs39, 3 - XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r - XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i - XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r - XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i - XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i - XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r - XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i - XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r - xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i - xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i - 
xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r - xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i - xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - stxsspx vs20, o0, TBUFFER // store r0_r - stxsspx vs21, o4, TBUFFER // store r0_i - stxsspx vs22, o8, TBUFFER // store r1_r - stxsspx vs23, o12, TBUFFER // store r1_i - lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 @@ -3028,25 +3088,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi BO, BO, 32 - xsmulsp vs32, vs0, vs8 // a0_r*b0_r - xsmulsp vs33, vs1, vs9 // a0_i*b0_i - xsmulsp vs34, vs0, vs9 // a0_r*b0_i - xsmulsp vs35, vs1, vs8 // a0_i*b0_r + xsmuldp vs32, vs0, vs8 // a0_r*b0_r + xsmuldp vs33, vs1, vs9 // a0_i*b0_i + xsmuldp vs34, vs0, vs9 // a0_r*b0_i + xsmuldp vs35, vs1, vs8 // a0_i*b0_r - xsmulsp vs36, vs0, vs10 // a0_r*b1_r - xsmulsp vs37, vs1, vs11 // a0_i*b1_i - xsmulsp vs38, vs0, vs11 // a0_r*b1_i - xsmulsp vs39, vs1, vs10 // a0_i*b1_r + xsmuldp vs36, vs0, vs10 // a0_r*b1_r + xsmuldp vs37, vs1, vs11 // a0_i*b1_i + xsmuldp vs38, vs0, vs11 // a0_r*b1_i + xsmuldp vs39, vs1, vs10 // a0_i*b1_r - xsmulsp vs40, vs0, vs12 // a0_r*b2_r - xsmulsp vs41, vs1, vs13 // a0_i*b2_i - xsmulsp vs42, vs0, vs13 // a0_r*b2_i - xsmulsp vs43, vs1, vs12 // a0_i*b2_r + xsmuldp vs40, vs0, vs12 // a0_r*b2_r + xsmuldp vs41, vs1, vs13 // a0_i*b2_i + xsmuldp vs42, vs0, vs13 // a0_r*b2_i + xsmuldp vs43, vs1, vs12 // a0_i*b2_r - xsmulsp vs44, vs0, vs14 // a0_r*b3_r - xsmulsp vs45, vs1, vs15 // a0_i*b3_i - xsmulsp vs46, vs0, vs15 // a0_r*b3_i - xsmulsp vs47, vs1, vs14 // a0_i*b3_r + xsmuldp vs44, vs0, vs14 // a0_r*b3_r + xsmuldp vs45, vs1, vs15 // a0_i*b3_i + xsmuldp vs46, vs0, vs15 // a0_r*b3_i + xsmuldp vs47, vs1, vs14 // a0_i*b3_r .endm @@ -3082,25 +3142,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi BO, BO, 32 - xsmaddasp vs32, vs0, vs8 // a0_r*b0_r - xsmaddasp vs33, vs1, vs9 // a0_i*b0_i - xsmaddasp vs34, vs0, vs9 // a0_r*b0_i - xsmaddasp vs35, vs1, vs8 // a0_i*b0_r + xsmaddadp vs32, vs0, vs8 // a0_r*b0_r + xsmaddadp vs33, vs1, vs9 // a0_i*b0_i + xsmaddadp vs34, vs0, vs9 // a0_r*b0_i + xsmaddadp vs35, vs1, vs8 // a0_i*b0_r - xsmaddasp vs36, vs0, vs10 // a0_r*b1_r - xsmaddasp vs37, vs1, vs11 // a0_i*b1_i - xsmaddasp vs38, vs0, vs11 // a0_r*b1_i - xsmaddasp vs39, vs1, vs10 // a0_i*b1_r + xsmaddadp vs36, vs0, vs10 // a0_r*b1_r + xsmaddadp vs37, vs1, vs11 // a0_i*b1_i + xsmaddadp vs38, vs0, vs11 // a0_r*b1_i + xsmaddadp vs39, vs1, vs10 // a0_i*b1_r - xsmaddasp vs40, vs0, vs12 // a0_r*b2_r - xsmaddasp vs41, vs1, vs13 // a0_i*b2_i - xsmaddasp vs42, vs0, vs13 // a0_r*b2_i - xsmaddasp vs43, vs1, vs12 // a0_i*b2_r + xsmaddadp vs40, vs0, vs12 // a0_r*b2_r + xsmaddadp vs41, vs1, vs13 // a0_i*b2_i + xsmaddadp vs42, vs0, vs13 // a0_r*b2_i + xsmaddadp vs43, vs1, vs12 // a0_i*b2_r - xsmaddasp vs44, vs0, vs14 // a0_r*b3_r - xsmaddasp vs45, vs1, vs15 // a0_i*b3_i - xsmaddasp vs46, vs0, vs15 // a0_r*b3_i - xsmaddasp vs47, vs1, vs14 // a0_i*b3_r + xsmaddadp vs44, vs0, vs14 // a0_r*b3_r + xsmaddadp vs45, vs1, vs15 // a0_i*b3_i + xsmaddadp vs46, vs0, vs15 // a0_r*b3_i + xsmaddadp vs47, vs1, vs14 // a0_i*b3_r .endm @@ -3136,25 +3196,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi BO, BO, 32 - xsmaddasp vs32, vs4, vs16 // a4_r*b0_r - xsmaddasp vs33, vs5, vs17 // a4_i*b0_i - xsmaddasp vs34, vs4, vs17 // a4_r*b0_i - xsmaddasp vs35, vs5, vs16 // a4_i*b0_r + xsmaddadp vs32, vs4, vs16 // a4_r*b0_r + xsmaddadp vs33, vs5, vs17 // a4_i*b0_i + xsmaddadp vs34, vs4, vs17 // a4_r*b0_i + xsmaddadp vs35, vs5, vs16 // a4_i*b0_r - xsmaddasp vs36, vs4, vs18 // a4_r*b1_r - xsmaddasp vs37, vs5, vs19 // a4_i*b1_i - xsmaddasp vs38, vs4, vs19 // a4_r*b1_i - xsmaddasp vs39, vs5, vs18 // a4_i*b1_r + xsmaddadp vs36, vs4, vs18 // a4_r*b1_r + xsmaddadp vs37, vs5, vs19 // a4_i*b1_i + xsmaddadp vs38, vs4, vs19 // a4_r*b1_i + xsmaddadp vs39, vs5, vs18 // a4_i*b1_r - xsmaddasp vs40, vs4, vs20 // a4_r*b2_r - xsmaddasp vs41, vs5, vs21 // a4_i*b2_i - xsmaddasp vs42, vs4, vs21 // a4_r*b2_i - xsmaddasp vs43, vs5, vs20 // a4_i*b2_r + xsmaddadp vs40, vs4, vs20 // a4_r*b2_r + xsmaddadp vs41, vs5, vs21 // a4_i*b2_i + xsmaddadp vs42, vs4, vs21 // a4_r*b2_i + xsmaddadp vs43, vs5, vs20 // a4_i*b2_r - xsmaddasp vs44, vs4, vs22 // a4_r*b3_r - xsmaddasp vs45, vs5, vs23 // a4_i*b3_i - xsmaddasp vs46, vs4, vs23 // a4_r*b3_i - xsmaddasp vs47, vs5, vs22 // a4_i*b3_r + xsmaddadp vs44, vs4, vs22 // a4_r*b3_r + xsmaddadp vs45, vs5, vs23 // a4_i*b3_i + xsmaddadp vs46, vs4, vs23 // a4_r*b3_i + xsmaddadp vs47, vs5, vs22 // a4_i*b3_r .endm @@ -3162,25 +3222,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro KERNEL4x1_E2 - xsmaddasp vs32, vs4, vs16 // a4_r*b0_r - xsmaddasp vs33, vs5, vs17 // a4_i*b0_i - xsmaddasp vs34, vs4, vs17 // a4_r*b0_i - xsmaddasp vs35, vs5, vs16 // a4_i*b0_r + xsmaddadp vs32, vs4, vs16 // a4_r*b0_r + xsmaddadp vs33, vs5, vs17 // a4_i*b0_i + xsmaddadp vs34, vs4, vs17 // a4_r*b0_i + xsmaddadp vs35, vs5, vs16 // a4_i*b0_r - xsmaddasp vs36, vs4, vs18 // a4_r*b1_r - xsmaddasp vs37, vs5, vs19 // a4_i*b1_i - xsmaddasp vs38, vs4, vs19 // a4_r*b1_i - xsmaddasp vs39, vs5, vs18 // a4_i*b1_r + xsmaddadp vs36, vs4, vs18 // a4_r*b1_r + xsmaddadp vs37, vs5, vs19 // a4_i*b1_i + xsmaddadp vs38, vs4, vs19 // a4_r*b1_i + xsmaddadp vs39, vs5, vs18 // a4_i*b1_r - xsmaddasp vs40, vs4, vs20 // a4_r*b2_r - xsmaddasp vs41, vs5, vs21 // a4_i*b2_i - xsmaddasp vs42, vs4, vs21 // a4_r*b2_i - xsmaddasp vs43, vs5, vs20 // a4_i*b2_r + xsmaddadp vs40, vs4, vs20 // a4_r*b2_r + xsmaddadp vs41, vs5, vs21 // a4_i*b2_i + xsmaddadp vs42, vs4, vs21 // a4_r*b2_i + xsmaddadp vs43, vs5, vs20 // a4_i*b2_r - xsmaddasp vs44, vs4, vs22 // a4_r*b3_r - xsmaddasp vs45, vs5, vs23 // a4_i*b3_i - xsmaddasp vs46, vs4, vs23 // a4_r*b3_i - xsmaddasp vs47, vs5, vs22 // a4_i*b3_r + xsmaddadp vs44, vs4, vs22 // a4_r*b3_r + xsmaddadp vs45, vs5, vs23 // a4_i*b3_i + xsmaddadp vs46, vs4, vs23 // a4_r*b3_i + xsmaddadp vs47, vs5, vs22 // a4_i*b3_r .endm @@ -3216,25 +3276,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi BO, BO, 32 - xsmulsp vs32, vs0, vs8 // a0_r*b0_r - xsmulsp vs33, vs1, vs9 // a0_i*b0_i - xsmulsp vs34, vs0, vs9 // a0_r*b0_i - xsmulsp vs35, vs1, vs8 // a0_i*b0_r + xsmuldp vs32, vs0, vs8 // a0_r*b0_r + xsmuldp vs33, vs1, vs9 // a0_i*b0_i + xsmuldp vs34, vs0, vs9 // a0_r*b0_i + xsmuldp vs35, vs1, vs8 // a0_i*b0_r - xsmulsp vs36, vs0, vs10 // a0_r*b1_r - xsmulsp vs37, vs1, vs11 // a0_i*b1_i - xsmulsp vs38, vs0, vs11 // a0_r*b1_i - xsmulsp vs39, vs1, vs10 // a0_i*b1_r + xsmuldp vs36, vs0, vs10 // a0_r*b1_r + xsmuldp vs37, vs1, vs11 // a0_i*b1_i + xsmuldp vs38, vs0, vs11 // a0_r*b1_i + xsmuldp vs39, vs1, vs10 // a0_i*b1_r - xsmulsp vs40, vs0, vs12 // a0_r*b2_r - xsmulsp vs41, vs1, vs13 // a0_i*b2_i - xsmulsp vs42, vs0, vs13 // a0_r*b2_i - xsmulsp vs43, vs1, vs12 // a0_i*b2_r + xsmuldp vs40, vs0, vs12 // a0_r*b2_r + xsmuldp vs41, vs1, vs13 // a0_i*b2_i + xsmuldp vs42, vs0, vs13 // a0_r*b2_i + xsmuldp vs43, vs1, vs12 // a0_i*b2_r - xsmulsp vs44, vs0, vs14 // a0_r*b3_r - xsmulsp vs45, vs1, vs15 // a0_i*b3_i - xsmulsp vs46, vs0, vs15 // a0_r*b3_i - xsmulsp vs47, vs1, vs14 // a0_i*b3_r + xsmuldp vs44, vs0, vs14 // a0_r*b3_r + xsmuldp vs45, vs1, vs15 // a0_i*b3_i + xsmuldp vs46, vs0, vs15 // a0_r*b3_i + xsmuldp vs47, vs1, vs14 // a0_i*b3_r .endm @@ -3270,25 +3330,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi BO, BO, 32 - xsmaddasp vs32, vs0, vs8 // a0_r*b0_r - xsmaddasp vs33, vs1, vs9 // a0_i*b0_i - xsmaddasp vs34, vs0, vs9 // a0_r*b0_i - xsmaddasp vs35, vs1, vs8 // a0_i*b0_r + xsmaddadp vs32, vs0, vs8 // a0_r*b0_r + xsmaddadp vs33, vs1, vs9 // a0_i*b0_i + xsmaddadp vs34, vs0, vs9 // a0_r*b0_i + xsmaddadp vs35, vs1, vs8 // a0_i*b0_r - xsmaddasp vs36, vs0, vs10 // a0_r*b1_r - xsmaddasp vs37, vs1, vs11 // a0_i*b1_i - xsmaddasp vs38, vs0, vs11 // a0_r*b1_i - xsmaddasp vs39, vs1, vs10 // a0_i*b1_r + xsmaddadp vs36, vs0, vs10 // a0_r*b1_r + xsmaddadp vs37, vs1, vs11 // a0_i*b1_i + xsmaddadp vs38, vs0, vs11 // a0_r*b1_i + xsmaddadp vs39, vs1, vs10 // a0_i*b1_r - xsmaddasp vs40, vs0, vs12 // a0_r*b2_r - xsmaddasp vs41, vs1, vs13 // a0_i*b2_i - xsmaddasp vs42, vs0, vs13 // a0_r*b2_i - xsmaddasp vs43, vs1, vs12 // a0_i*b2_r + xsmaddadp vs40, vs0, vs12 // a0_r*b2_r + xsmaddadp vs41, vs1, vs13 // a0_i*b2_i + xsmaddadp vs42, vs0, vs13 // a0_r*b2_i + xsmaddadp vs43, vs1, vs12 // a0_i*b2_r - xsmaddasp vs44, vs0, vs14 // a0_r*b3_r - xsmaddasp vs45, vs1, vs15 // a0_i*b3_i - xsmaddasp vs46, vs0, vs15 // a0_r*b3_i - xsmaddasp vs47, vs1, vs14 // a0_i*b3_r + xsmaddadp vs44, vs0, vs14 // a0_r*b3_r + xsmaddadp vs45, vs1, vs15 // a0_i*b3_i + xsmaddadp vs46, vs0, vs15 // a0_r*b3_i + xsmaddadp vs47, vs1, vs14 // a0_i*b3_r .endm @@ -3320,16 +3380,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
XSFADD_R2 vs4, vs4, vs33 // add a0_i * b0_i XSFADD_I2 vs5, vs5, vs34 // add a0_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xsmuldp vs16, vs4, alpha_dr // r0_r * alpha_r + xsmuldp vs17, vs5, alpha_di // r0_i * alpha_i + xsmuldp vs18, vs4, alpha_di // r0_r * alpha_i + xsmuldp vs19, vs5, alpha_dr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xssubdp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsadddp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsaddsp vs0, vs0, vs20 - xsaddsp vs1, vs1, vs21 + xsadddp vs0, vs0, vs20 + xsadddp vs1, vs1, vs21 stxsspx vs0, o0, T2 // store c0_r @@ -3362,16 +3422,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. XSFADD_R2 vs4, vs4, vs37 // add a0_i * b0_i XSFADD_I2 vs5, vs5, vs38 // add a0_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xsmuldp vs16, vs4, alpha_dr // r0_r * alpha_r + xsmuldp vs17, vs5, alpha_di // r0_i * alpha_i + xsmuldp vs18, vs4, alpha_di // r0_r * alpha_i + xsmuldp vs19, vs5, alpha_dr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xssubdp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsadddp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsaddsp vs0, vs0, vs20 - xsaddsp vs1, vs1, vs21 + xsadddp vs0, vs0, vs20 + xsadddp vs1, vs1, vs21 stxsspx vs0, o0, T2 // store c0_r @@ -3404,16 +3464,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
XSFADD_R2 vs4, vs4, vs41 // add a0_i * b0_i XSFADD_I2 vs5, vs5, vs42 // add a0_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xsmuldp vs16, vs4, alpha_dr // r0_r * alpha_r + xsmuldp vs17, vs5, alpha_di // r0_i * alpha_i + xsmuldp vs18, vs4, alpha_di // r0_r * alpha_i + xsmuldp vs19, vs5, alpha_dr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xssubdp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsadddp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsaddsp vs0, vs0, vs20 - xsaddsp vs1, vs1, vs21 + xsadddp vs0, vs0, vs20 + xsadddp vs1, vs1, vs21 stxsspx vs0, o0, T2 // store c0_r @@ -3446,16 +3506,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. XSFADD_R2 vs4, vs4, vs45 // add a0_i * b0_i XSFADD_I2 vs5, vs5, vs46 // add a0_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xsmuldp vs16, vs4, alpha_dr // r0_r * alpha_r + xsmuldp vs17, vs5, alpha_di // r0_i * alpha_i + xsmuldp vs18, vs4, alpha_di // r0_r * alpha_i + xsmuldp vs19, vs5, alpha_dr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xssubdp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsadddp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsaddsp vs0, vs0, vs20 - xsaddsp vs1, vs1, vs21 + xsadddp vs0, vs0, vs20 + xsadddp vs1, vs1, vs21 stxsspx vs0, o0, T2 // store c0_r @@ -3773,51 +3833,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 #endif - stxvw4x vs32, o0, TBUFFER - lxsspx vs8, o0, TBUFFER - lxsspx vs9, o4, TBUFFER - lxsspx vs10, o8, TBUFFER - lxsspx vs11, o12, TBUFFER + xxspltw vs8, vs32, 0 + xxspltw vs9, vs32, 1 + xxspltw vs10, vs32, 2 + xxspltw vs11, vs32, 3 - stxvw4x vs33, o0, TBUFFER - lxsspx vs12, o0, TBUFFER - lxsspx vs13, o4, TBUFFER - lxsspx vs14, o8, TBUFFER - lxsspx vs15, o12, TBUFFER + xxspltw vs12, vs33, 0 + xxspltw vs13, vs33, 1 + xxspltw vs14, vs33, 2 + xxspltw vs15, vs33, 3 - XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r - XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i - XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r - XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i - XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i - XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r - XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i - XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r - xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i - xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i - 
xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r - xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i - xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - stxsspx vs20, o0, TBUFFER // store r0_r - stxsspx vs21, o4, TBUFFER // store r0_i - stxsspx vs22, o8, TBUFFER // store r1_r - stxsspx vs23, o12, TBUFFER // store r1_i - lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 @@ -3838,51 +3899,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 #endif - stxvw4x vs34, o0, TBUFFER - lxsspx vs8, o0, TBUFFER - lxsspx vs9, o4, TBUFFER - lxsspx vs10, o8, TBUFFER - lxsspx vs11, o12, TBUFFER + xxspltw vs8, vs34, 0 + xxspltw vs9, vs34, 1 + xxspltw vs10, vs34, 2 + xxspltw vs11, vs34, 3 - stxvw4x vs35, o0, TBUFFER - lxsspx vs12, o0, TBUFFER - lxsspx vs13, o4, TBUFFER - lxsspx vs14, o8, TBUFFER - lxsspx vs15, o12, TBUFFER + xxspltw vs12, vs35, 0 + xxspltw vs13, vs35, 1 + xxspltw vs14, vs35, 2 + xxspltw vs15, vs35, 3 - XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r - XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i - XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r - XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i - XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i - XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r - XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i - XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r - xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i - xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i - 
xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r - xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i - xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - stxsspx vs20, o0, TBUFFER // store r0_r - stxsspx vs21, o4, TBUFFER // store r0_i - stxsspx vs22, o8, TBUFFER // store r1_r - stxsspx vs23, o12, TBUFFER // store r1_i - lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 @@ -3903,51 +3965,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 #endif - stxvw4x vs36, o0, TBUFFER - lxsspx vs8, o0, TBUFFER - lxsspx vs9, o4, TBUFFER - lxsspx vs10, o8, TBUFFER - lxsspx vs11, o12, TBUFFER + xxspltw vs8, vs36, 0 + xxspltw vs9, vs36, 1 + xxspltw vs10, vs36, 2 + xxspltw vs11, vs36, 3 - stxvw4x vs37, o0, TBUFFER - lxsspx vs12, o0, TBUFFER - lxsspx vs13, o4, TBUFFER - lxsspx vs14, o8, TBUFFER - lxsspx vs15, o12, TBUFFER + xxspltw vs12, vs37, 0 + xxspltw vs13, vs37, 1 + xxspltw vs14, vs37, 2 + xxspltw vs15, vs37, 3 - XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r - XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i - XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r - XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i - XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i - XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r - XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i - XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r - xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i - xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i - 
xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r - xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i - xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - stxsspx vs20, o0, TBUFFER // store r0_r - stxsspx vs21, o4, TBUFFER // store r0_i - stxsspx vs22, o8, TBUFFER // store r1_r - stxsspx vs23, o12, TBUFFER // store r1_i - lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 @@ -3968,51 +4031,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 #endif - stxvw4x vs38, o0, TBUFFER - lxsspx vs8, o0, TBUFFER - lxsspx vs9, o4, TBUFFER - lxsspx vs10, o8, TBUFFER - lxsspx vs11, o12, TBUFFER + xxspltw vs8, vs38, 0 + xxspltw vs9, vs38, 1 + xxspltw vs10, vs38, 2 + xxspltw vs11, vs38, 3 - stxvw4x vs39, o0, TBUFFER - lxsspx vs12, o0, TBUFFER - lxsspx vs13, o4, TBUFFER - lxsspx vs14, o8, TBUFFER - lxsspx vs15, o12, TBUFFER + xxspltw vs12, vs39, 0 + xxspltw vs13, vs39, 1 + xxspltw vs14, vs39, 2 + xxspltw vs15, vs39, 3 - XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r - XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i - XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r - XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i - XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i - XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r - XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i - XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r - xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i - xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i - 
xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r - xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i - xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - stxsspx vs20, o0, TBUFFER // store r0_r - stxsspx vs21, o4, TBUFFER // store r0_i - stxsspx vs22, o8, TBUFFER // store r1_r - stxsspx vs23, o12, TBUFFER // store r1_i - lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 @@ -4039,51 +4103,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 #endif - stxvw4x vs40, o0, TBUFFER - lxsspx vs8, o0, TBUFFER - lxsspx vs9, o4, TBUFFER - lxsspx vs10, o8, TBUFFER - lxsspx vs11, o12, TBUFFER + xxspltw vs8, vs40, 0 + xxspltw vs9, vs40, 1 + xxspltw vs10, vs40, 2 + xxspltw vs11, vs40, 3 - stxvw4x vs41, o0, TBUFFER - lxsspx vs12, o0, TBUFFER - lxsspx vs13, o4, TBUFFER - lxsspx vs14, o8, TBUFFER - lxsspx vs15, o12, TBUFFER + xxspltw vs12, vs41, 0 + xxspltw vs13, vs41, 1 + xxspltw vs14, vs41, 2 + xxspltw vs15, vs41, 3 - XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r - XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i - XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r - XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i - XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i - XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r - XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i - XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r - xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i - xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i - 
xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r - xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i - xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - stxsspx vs20, o0, TBUFFER // store r0_r - stxsspx vs21, o4, TBUFFER // store r0_i - stxsspx vs22, o8, TBUFFER // store r1_r - stxsspx vs23, o12, TBUFFER // store r1_i - lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 @@ -4104,51 +4169,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 #endif - stxvw4x vs42, o0, TBUFFER - lxsspx vs8, o0, TBUFFER - lxsspx vs9, o4, TBUFFER - lxsspx vs10, o8, TBUFFER - lxsspx vs11, o12, TBUFFER + xxspltw vs8, vs42, 0 + xxspltw vs9, vs42, 1 + xxspltw vs10, vs42, 2 + xxspltw vs11, vs42, 3 - stxvw4x vs43, o0, TBUFFER - lxsspx vs12, o0, TBUFFER - lxsspx vs13, o4, TBUFFER - lxsspx vs14, o8, TBUFFER - lxsspx vs15, o12, TBUFFER + xxspltw vs12, vs43, 0 + xxspltw vs13, vs43, 1 + xxspltw vs14, vs43, 2 + xxspltw vs15, vs43, 3 - XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r - XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i - XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r - XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i - XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i - XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r - XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i - XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r - xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i - xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i - 
xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r - xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i - xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - stxsspx vs20, o0, TBUFFER // store r0_r - stxsspx vs21, o4, TBUFFER // store r0_i - stxsspx vs22, o8, TBUFFER // store r1_r - stxsspx vs23, o12, TBUFFER // store r1_i - lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 @@ -4169,51 +4235,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 #endif - stxvw4x vs44, o0, TBUFFER - lxsspx vs8, o0, TBUFFER - lxsspx vs9, o4, TBUFFER - lxsspx vs10, o8, TBUFFER - lxsspx vs11, o12, TBUFFER + xxspltw vs8, vs44, 0 + xxspltw vs9, vs44, 1 + xxspltw vs10, vs44, 2 + xxspltw vs11, vs44, 3 - stxvw4x vs45, o0, TBUFFER - lxsspx vs12, o0, TBUFFER - lxsspx vs13, o4, TBUFFER - lxsspx vs14, o8, TBUFFER - lxsspx vs15, o12, TBUFFER + xxspltw vs12, vs45, 0 + xxspltw vs13, vs45, 1 + xxspltw vs14, vs45, 2 + xxspltw vs15, vs45, 3 - XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r - XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i - XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r - XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i - XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i - XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r - XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i - XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r - xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i - xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i - 
xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r - xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i - xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - stxsspx vs20, o0, TBUFFER // store r0_r - stxsspx vs21, o4, TBUFFER // store r0_i - stxsspx vs22, o8, TBUFFER // store r1_r - stxsspx vs23, o12, TBUFFER // store r1_i - lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 @@ -4234,51 +4301,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 #endif - stxvw4x vs46, o0, TBUFFER - lxsspx vs8, o0, TBUFFER - lxsspx vs9, o4, TBUFFER - lxsspx vs10, o8, TBUFFER - lxsspx vs11, o12, TBUFFER + xxspltw vs8, vs46, 0 + xxspltw vs9, vs46, 1 + xxspltw vs10, vs46, 2 + xxspltw vs11, vs46, 3 - stxvw4x vs47, o0, TBUFFER - lxsspx vs12, o0, TBUFFER - lxsspx vs13, o4, TBUFFER - lxsspx vs14, o8, TBUFFER - lxsspx vs15, o12, TBUFFER + xxspltw vs12, vs47, 0 + xxspltw vs13, vs47, 1 + xxspltw vs14, vs47, 2 + xxspltw vs15, vs47, 3 - XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r - XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i - XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r - XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i - XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i - XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r - XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i - XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r - xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i - xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i - 
xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r - xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i - xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - stxsspx vs20, o0, TBUFFER // store r0_r - stxsspx vs21, o4, TBUFFER // store r0_i - stxsspx vs22, o8, TBUFFER // store r1_r - stxsspx vs23, o12, TBUFFER // store r1_i - lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 @@ -4524,51 +4592,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 #endif - stxvw4x vs32, o0, TBUFFER - lxsspx vs8, o0, TBUFFER - lxsspx vs9, o4, TBUFFER - lxsspx vs10, o8, TBUFFER - lxsspx vs11, o12, TBUFFER + xxspltw vs8, vs32, 0 + xxspltw vs9, vs32, 1 + xxspltw vs10, vs32, 2 + xxspltw vs11, vs32, 3 - stxvw4x vs33, o0, TBUFFER - lxsspx vs12, o0, TBUFFER - lxsspx vs13, o4, TBUFFER - lxsspx vs14, o8, TBUFFER - lxsspx vs15, o12, TBUFFER + xxspltw vs12, vs33, 0 + xxspltw vs13, vs33, 1 + xxspltw vs14, vs33, 2 + xxspltw vs15, vs33, 3 - XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r - XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i - XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r - XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i - XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i - XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r - XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i - XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r - xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i - xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i - 
xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r - xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i - xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - stxsspx vs20, o0, TBUFFER // store r0_r - stxsspx vs21, o4, TBUFFER // store r0_i - stxsspx vs22, o8, TBUFFER // store r1_r - stxsspx vs23, o12, TBUFFER // store r1_i - lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 @@ -4589,51 +4658,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 #endif - stxvw4x vs34, o0, TBUFFER - lxsspx vs8, o0, TBUFFER - lxsspx vs9, o4, TBUFFER - lxsspx vs10, o8, TBUFFER - lxsspx vs11, o12, TBUFFER + xxspltw vs8, vs34, 0 + xxspltw vs9, vs34, 1 + xxspltw vs10, vs34, 2 + xxspltw vs11, vs34, 3 - stxvw4x vs35, o0, TBUFFER - lxsspx vs12, o0, TBUFFER - lxsspx vs13, o4, TBUFFER - lxsspx vs14, o8, TBUFFER - lxsspx vs15, o12, TBUFFER + xxspltw vs12, vs35, 0 + xxspltw vs13, vs35, 1 + xxspltw vs14, vs35, 2 + xxspltw vs15, vs35, 3 - XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r - XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i - XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r - XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i - XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i - XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r - XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i - XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r - xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i - xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i - 
xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r - xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i - xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - stxsspx vs20, o0, TBUFFER // store r0_r - stxsspx vs21, o4, TBUFFER // store r0_i - stxsspx vs22, o8, TBUFFER // store r1_r - stxsspx vs23, o12, TBUFFER // store r1_i - lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 @@ -4660,51 +4730,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 #endif - stxvw4x vs36, o0, TBUFFER - lxsspx vs8, o0, TBUFFER - lxsspx vs9, o4, TBUFFER - lxsspx vs10, o8, TBUFFER - lxsspx vs11, o12, TBUFFER + xxspltw vs8, vs36, 0 + xxspltw vs9, vs36, 1 + xxspltw vs10, vs36, 2 + xxspltw vs11, vs36, 3 - stxvw4x vs37, o0, TBUFFER - lxsspx vs12, o0, TBUFFER - lxsspx vs13, o4, TBUFFER - lxsspx vs14, o8, TBUFFER - lxsspx vs15, o12, TBUFFER + xxspltw vs12, vs37, 0 + xxspltw vs13, vs37, 1 + xxspltw vs14, vs37, 2 + xxspltw vs15, vs37, 3 - XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r - XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i - XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r - XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i - XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i - XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r - XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i - XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r - xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i - xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i - 
xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r - xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i - xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - stxsspx vs20, o0, TBUFFER // store r0_r - stxsspx vs21, o4, TBUFFER // store r0_i - stxsspx vs22, o8, TBUFFER // store r1_r - stxsspx vs23, o12, TBUFFER // store r1_i - lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 @@ -4725,51 +4796,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 #endif - stxvw4x vs38, o0, TBUFFER - lxsspx vs8, o0, TBUFFER - lxsspx vs9, o4, TBUFFER - lxsspx vs10, o8, TBUFFER - lxsspx vs11, o12, TBUFFER + xxspltw vs8, vs38, 0 + xxspltw vs9, vs38, 1 + xxspltw vs10, vs38, 2 + xxspltw vs11, vs38, 3 - stxvw4x vs39, o0, TBUFFER - lxsspx vs12, o0, TBUFFER - lxsspx vs13, o4, TBUFFER - lxsspx vs14, o8, TBUFFER - lxsspx vs15, o12, TBUFFER + xxspltw vs12, vs39, 0 + xxspltw vs13, vs39, 1 + xxspltw vs14, vs39, 2 + xxspltw vs15, vs39, 3 - XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r - XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i - XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r - XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i - XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i - XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r - XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i - XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r - xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i - xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i - 
xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r - xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i - xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - stxsspx vs20, o0, TBUFFER // store r0_r - stxsspx vs21, o4, TBUFFER // store r0_i - stxsspx vs22, o8, TBUFFER // store r1_r - stxsspx vs23, o12, TBUFFER // store r1_i - lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 @@ -4979,51 +5051,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 #endif - stxvw4x vs32, o0, TBUFFER - lxsspx vs8, o0, TBUFFER - lxsspx vs9, o4, TBUFFER - lxsspx vs10, o8, TBUFFER - lxsspx vs11, o12, TBUFFER + xxspltw vs8, vs32, 0 + xxspltw vs9, vs32, 1 + xxspltw vs10, vs32, 2 + xxspltw vs11, vs32, 3 - stxvw4x vs33, o0, TBUFFER - lxsspx vs12, o0, TBUFFER - lxsspx vs13, o4, TBUFFER - lxsspx vs14, o8, TBUFFER - lxsspx vs15, o12, TBUFFER + xxspltw vs12, vs33, 0 + xxspltw vs13, vs33, 1 + xxspltw vs14, vs33, 2 + xxspltw vs15, vs33, 3 - XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r - XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i - XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r - XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i - XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i - XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r - XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i - XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r - xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i - xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i - 
xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r - xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i - xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - stxsspx vs20, o0, TBUFFER // store r0_r - stxsspx vs21, o4, TBUFFER // store r0_i - stxsspx vs22, o8, TBUFFER // store r1_r - stxsspx vs23, o12, TBUFFER // store r1_i - lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 @@ -5050,51 +5123,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 #endif - stxvw4x vs34, o0, TBUFFER - lxsspx vs8, o0, TBUFFER - lxsspx vs9, o4, TBUFFER - lxsspx vs10, o8, TBUFFER - lxsspx vs11, o12, TBUFFER + xxspltw vs8, vs34, 0 + xxspltw vs9, vs34, 1 + xxspltw vs10, vs34, 2 + xxspltw vs11, vs34, 3 - stxvw4x vs35, o0, TBUFFER - lxsspx vs12, o0, TBUFFER - lxsspx vs13, o4, TBUFFER - lxsspx vs14, o8, TBUFFER - lxsspx vs15, o12, TBUFFER + xxspltw vs12, vs35, 0 + xxspltw vs13, vs35, 1 + xxspltw vs14, vs35, 2 + xxspltw vs15, vs35, 3 - XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r - XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i - XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r - XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i - XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i - XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r - XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i - XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r - xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i - xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i - 
xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r - xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i - xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - stxsspx vs20, o0, TBUFFER // store r0_r - stxsspx vs21, o4, TBUFFER // store r0_i - stxsspx vs22, o8, TBUFFER // store r1_r - stxsspx vs23, o12, TBUFFER // store r1_i - lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 @@ -5154,15 +5228,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 16 - xsmulsp vs32, vs0, vs8 // a0_r*b0_r - xsmulsp vs33, vs1, vs9 // a0_i*b0_i - xsmulsp vs34, vs0, vs9 // a0_r*b0_i - xsmulsp vs35, vs1, vs8 // a0_i*b0_r + xsmuldp vs32, vs0, vs8 // a0_r*b0_r + xsmuldp vs33, vs1, vs9 // a0_i*b0_i + xsmuldp vs34, vs0, vs9 // a0_r*b0_i + xsmuldp vs35, vs1, vs8 // a0_i*b0_r - xsmulsp vs36, vs0, vs10 // a0_r*b1_r - xsmulsp vs37, vs1, vs11 // a0_i*b1_i - xsmulsp vs38, vs0, vs11 // a0_r*b1_i - xsmulsp vs39, vs1, vs10 // a0_i*b1_r + xsmuldp vs36, vs0, vs10 // a0_r*b1_r + xsmuldp vs37, vs1, vs11 // a0_i*b1_i + xsmuldp vs38, vs0, vs11 // a0_r*b1_i + xsmuldp vs39, vs1, vs10 // a0_i*b1_r .endm @@ -5188,15 +5262,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi BO, BO, 16 - xsmaddasp vs32, vs0, vs8 // a0_r*b0_r - xsmaddasp vs33, vs1, vs9 // a0_i*b0_i - xsmaddasp vs34, vs0, vs9 // a0_r*b0_i - xsmaddasp vs35, vs1, vs8 // a0_i*b0_r + xsmaddadp vs32, vs0, vs8 // a0_r*b0_r + xsmaddadp vs33, vs1, vs9 // a0_i*b0_i + xsmaddadp vs34, vs0, vs9 // a0_r*b0_i + xsmaddadp vs35, vs1, vs8 // a0_i*b0_r - xsmaddasp vs36, vs0, vs10 // a0_r*b1_r - xsmaddasp vs37, vs1, vs11 // a0_i*b1_i - xsmaddasp vs38, vs0, vs11 // a0_r*b1_i - xsmaddasp vs39, vs1, vs10 // a0_i*b1_r + xsmaddadp vs36, vs0, vs10 // a0_r*b1_r + xsmaddadp vs37, vs1, vs11 // a0_i*b1_i + xsmaddadp vs38, vs0, vs11 // a0_r*b1_i + xsmaddadp vs39, vs1, vs10 // a0_i*b1_r .endm @@ -5222,15 +5296,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 16 - xsmaddasp vs32, vs4, vs16 // a4_r*b0_r - xsmaddasp vs33, vs5, vs17 // a4_i*b0_i - xsmaddasp vs34, vs4, vs17 // a4_r*b0_i - xsmaddasp vs35, vs5, vs16 // a4_i*b0_r + xsmaddadp vs32, vs4, vs16 // a4_r*b0_r + xsmaddadp vs33, vs5, vs17 // a4_i*b0_i + xsmaddadp vs34, vs4, vs17 // a4_r*b0_i + xsmaddadp vs35, vs5, vs16 // a4_i*b0_r - xsmaddasp vs36, vs4, vs18 // a4_r*b1_r - xsmaddasp vs37, vs5, vs19 // a4_i*b1_i - xsmaddasp vs38, vs4, vs19 // a4_r*b1_i - xsmaddasp vs39, vs5, vs18 // a4_i*b1_r + xsmaddadp vs36, vs4, vs18 // a4_r*b1_r + xsmaddadp vs37, vs5, vs19 // a4_i*b1_i + xsmaddadp vs38, vs4, vs19 // a4_r*b1_i + xsmaddadp vs39, vs5, vs18 // a4_i*b1_r .endm @@ -5238,15 +5312,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro KERNEL2x1_E2 - xsmaddasp vs32, vs4, vs16 // a4_r*b0_r - xsmaddasp vs33, vs5, vs17 // a4_i*b0_i - xsmaddasp vs34, vs4, vs17 // a4_r*b0_i - xsmaddasp vs35, vs5, vs16 // a4_i*b0_r + xsmaddadp vs32, vs4, vs16 // a4_r*b0_r + xsmaddadp vs33, vs5, vs17 // a4_i*b0_i + xsmaddadp vs34, vs4, vs17 // a4_r*b0_i + xsmaddadp vs35, vs5, vs16 // a4_i*b0_r - xsmaddasp vs36, vs4, vs18 // a4_r*b1_r - xsmaddasp vs37, vs5, vs19 // a4_i*b1_i - xsmaddasp vs38, vs4, vs19 // a4_r*b1_i - xsmaddasp vs39, vs5, vs18 // a4_i*b1_r + xsmaddadp vs36, vs4, vs18 // a4_r*b1_r + xsmaddadp vs37, vs5, vs19 // a4_i*b1_i + xsmaddadp vs38, vs4, vs19 // a4_r*b1_i + xsmaddadp vs39, vs5, vs18 // a4_i*b1_r .endm @@ -5272,15 +5346,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 16 - xsmulsp vs32, vs0, vs8 // a0_r*b0_r - xsmulsp vs33, vs1, vs9 // a0_i*b0_i - xsmulsp vs34, vs0, vs9 // a0_r*b0_i - xsmulsp vs35, vs1, vs8 // a0_i*b0_r + xsmuldp vs32, vs0, vs8 // a0_r*b0_r + xsmuldp vs33, vs1, vs9 // a0_i*b0_i + xsmuldp vs34, vs0, vs9 // a0_r*b0_i + xsmuldp vs35, vs1, vs8 // a0_i*b0_r - xsmulsp vs36, vs0, vs10 // a0_r*b1_r - xsmulsp vs37, vs1, vs11 // a0_i*b1_i - xsmulsp vs38, vs0, vs11 // a0_r*b1_i - xsmulsp vs39, vs1, vs10 // a0_i*b1_r + xsmuldp vs36, vs0, vs10 // a0_r*b1_r + xsmuldp vs37, vs1, vs11 // a0_i*b1_i + xsmuldp vs38, vs0, vs11 // a0_r*b1_i + xsmuldp vs39, vs1, vs10 // a0_i*b1_r .endm @@ -5306,15 +5380,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi BO, BO, 16 - xsmaddasp vs32, vs0, vs8 // a0_r*b0_r - xsmaddasp vs33, vs1, vs9 // a0_i*b0_i - xsmaddasp vs34, vs0, vs9 // a0_r*b0_i - xsmaddasp vs35, vs1, vs8 // a0_i*b0_r + xsmaddadp vs32, vs0, vs8 // a0_r*b0_r + xsmaddadp vs33, vs1, vs9 // a0_i*b0_i + xsmaddadp vs34, vs0, vs9 // a0_r*b0_i + xsmaddadp vs35, vs1, vs8 // a0_i*b0_r - xsmaddasp vs36, vs0, vs10 // a0_r*b1_r - xsmaddasp vs37, vs1, vs11 // a0_i*b1_i - xsmaddasp vs38, vs0, vs11 // a0_r*b1_i - xsmaddasp vs39, vs1, vs10 // a0_i*b1_r + xsmaddadp vs36, vs0, vs10 // a0_r*b1_r + xsmaddadp vs37, vs1, vs11 // a0_i*b1_i + xsmaddadp vs38, vs0, vs11 // a0_r*b1_i + xsmaddadp vs39, vs1, vs10 // a0_i*b1_r .endm @@ -5346,16 +5420,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. XSFADD_R2 vs4, vs4, vs33 // add a0_i * b0_i XSFADD_I2 vs5, vs5, vs34 // add a0_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xsmuldp vs16, vs4, alpha_dr // r0_r * alpha_r + xsmuldp vs17, vs5, alpha_di // r0_i * alpha_i + xsmuldp vs18, vs4, alpha_di // r0_r * alpha_i + xsmuldp vs19, vs5, alpha_dr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xssubdp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsadddp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsaddsp vs0, vs0, vs20 - xsaddsp vs1, vs1, vs21 + xsadddp vs0, vs0, vs20 + xsadddp vs1, vs1, vs21 stxsspx vs0, o0, T2 // store c0_r @@ -5388,16 +5462,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
XSFADD_R2 vs4, vs4, vs37 // add a0_i * b0_i XSFADD_I2 vs5, vs5, vs38 // add a0_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xsmuldp vs16, vs4, alpha_dr // r0_r * alpha_r + xsmuldp vs17, vs5, alpha_di // r0_i * alpha_i + xsmuldp vs18, vs4, alpha_di // r0_r * alpha_i + xsmuldp vs19, vs5, alpha_dr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xssubdp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsadddp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsaddsp vs0, vs0, vs20 - xsaddsp vs1, vs1, vs21 + xsadddp vs0, vs0, vs20 + xsadddp vs1, vs1, vs21 stxsspx vs0, o0, T2 // store c0_r @@ -5673,51 +5747,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 #endif - stxvw4x vs32, o0, TBUFFER - lxsspx vs8, o0, TBUFFER - lxsspx vs9, o4, TBUFFER - lxsspx vs10, o8, TBUFFER - lxsspx vs11, o12, TBUFFER + xxspltw vs8, vs32, 0 + xxspltw vs9, vs32, 1 + xxspltw vs10, vs32, 2 + xxspltw vs11, vs32, 3 - stxvw4x vs33, o0, TBUFFER - lxsspx vs12, o0, TBUFFER - lxsspx vs13, o4, TBUFFER - lxsspx vs14, o8, TBUFFER - lxsspx vs15, o12, TBUFFER + xxspltw vs12, vs33, 0 + xxspltw vs13, vs33, 1 + xxspltw vs14, vs33, 2 + xxspltw vs15, vs33, 3 - XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r - XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i - XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r - XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i - XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i - XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r - XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i - XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + XVFADD_R2 
vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r - xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i - xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i - xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r - xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i - xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - stxsspx vs20, o0, TBUFFER // store r0_r - stxsspx vs21, o4, TBUFFER // store r0_i - stxsspx vs22, o8, TBUFFER // store r1_r - stxsspx vs23, o12, TBUFFER // store r1_i - lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 @@ -5738,51 
+5813,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 #endif - stxvw4x vs34, o0, TBUFFER - lxsspx vs8, o0, TBUFFER - lxsspx vs9, o4, TBUFFER - lxsspx vs10, o8, TBUFFER - lxsspx vs11, o12, TBUFFER + xxspltw vs8, vs34, 0 + xxspltw vs9, vs34, 1 + xxspltw vs10, vs34, 2 + xxspltw vs11, vs34, 3 - stxvw4x vs35, o0, TBUFFER - lxsspx vs12, o0, TBUFFER - lxsspx vs13, o4, TBUFFER - lxsspx vs14, o8, TBUFFER - lxsspx vs15, o12, TBUFFER + xxspltw vs12, vs35, 0 + xxspltw vs13, vs35, 1 + xxspltw vs14, vs35, 2 + xxspltw vs15, vs35, 3 - XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r - XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i - XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r - XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i - XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i - XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r - XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i - XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r - xsmulsp vs17, 
vs7, alpha_i // r1_i * alpha_i - xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i - xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r - xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i - xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - stxsspx vs20, o0, TBUFFER // store r0_r - stxsspx vs21, o4, TBUFFER // store r0_i - stxsspx vs22, o8, TBUFFER // store r1_r - stxsspx vs23, o12, TBUFFER // store r1_i - lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 @@ -5803,51 +5879,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 #endif - stxvw4x vs36, o0, TBUFFER - lxsspx vs8, o0, TBUFFER - lxsspx vs9, o4, TBUFFER - lxsspx vs10, o8, TBUFFER - lxsspx vs11, o12, TBUFFER + xxspltw vs8, vs36, 0 + xxspltw vs9, vs36, 1 + xxspltw vs10, vs36, 2 + xxspltw vs11, vs36, 3 - stxvw4x vs37, o0, TBUFFER - lxsspx vs12, o0, TBUFFER - lxsspx vs13, o4, TBUFFER - lxsspx vs14, o8, TBUFFER - lxsspx vs15, o12, TBUFFER + xxspltw vs12, vs37, 0 + xxspltw vs13, vs37, 1 + xxspltw vs14, vs37, 2 + xxspltw vs15, vs37, 3 - XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r - XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i - XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r - XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i - XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i - XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r - XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i - XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r - xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i - xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i - 
xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r - xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i - xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - stxsspx vs20, o0, TBUFFER // store r0_r - stxsspx vs21, o4, TBUFFER // store r0_i - stxsspx vs22, o8, TBUFFER // store r1_r - stxsspx vs23, o12, TBUFFER // store r1_i - lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 @@ -5868,51 +5945,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 #endif - stxvw4x vs38, o0, TBUFFER - lxsspx vs8, o0, TBUFFER - lxsspx vs9, o4, TBUFFER - lxsspx vs10, o8, TBUFFER - lxsspx vs11, o12, TBUFFER + xxspltw vs8, vs38, 0 + xxspltw vs9, vs38, 1 + xxspltw vs10, vs38, 2 + xxspltw vs11, vs38, 3 - stxvw4x vs39, o0, TBUFFER - lxsspx vs12, o0, TBUFFER - lxsspx vs13, o4, TBUFFER - lxsspx vs14, o8, TBUFFER - lxsspx vs15, o12, TBUFFER + xxspltw vs12, vs39, 0 + xxspltw vs13, vs39, 1 + xxspltw vs14, vs39, 2 + xxspltw vs15, vs39, 3 - XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r - XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i - XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r - XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i - XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i - XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r - XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i - XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r - xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i - xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i - 
xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r - xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i - xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - stxsspx vs20, o0, TBUFFER // store r0_r - stxsspx vs21, o4, TBUFFER // store r0_i - stxsspx vs22, o8, TBUFFER // store r1_r - stxsspx vs23, o12, TBUFFER // store r1_i - lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 @@ -6140,51 +6218,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 #endif - stxvw4x vs32, o0, TBUFFER - lxsspx vs8, o0, TBUFFER - lxsspx vs9, o4, TBUFFER - lxsspx vs10, o8, TBUFFER - lxsspx vs11, o12, TBUFFER + xxspltw vs8, vs32, 0 + xxspltw vs9, vs32, 1 + xxspltw vs10, vs32, 2 + xxspltw vs11, vs32, 3 - stxvw4x vs33, o0, TBUFFER - lxsspx vs12, o0, TBUFFER - lxsspx vs13, o4, TBUFFER - lxsspx vs14, o8, TBUFFER - lxsspx vs15, o12, TBUFFER + xxspltw vs12, vs33, 0 + xxspltw vs13, vs33, 1 + xxspltw vs14, vs33, 2 + xxspltw vs15, vs33, 3 - XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r - XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i - XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r - XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i - XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i - XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r - XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i - XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r - xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i - xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i - 
xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r - xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i - xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - stxsspx vs20, o0, TBUFFER // store r0_r - stxsspx vs21, o4, TBUFFER // store r0_i - stxsspx vs22, o8, TBUFFER // store r1_r - stxsspx vs23, o12, TBUFFER // store r1_i - lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 @@ -6205,51 +6284,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 #endif - stxvw4x vs34, o0, TBUFFER - lxsspx vs8, o0, TBUFFER - lxsspx vs9, o4, TBUFFER - lxsspx vs10, o8, TBUFFER - lxsspx vs11, o12, TBUFFER + xxspltw vs8, vs34, 0 + xxspltw vs9, vs34, 1 + xxspltw vs10, vs34, 2 + xxspltw vs11, vs34, 3 - stxvw4x vs35, o0, TBUFFER - lxsspx vs12, o0, TBUFFER - lxsspx vs13, o4, TBUFFER - lxsspx vs14, o8, TBUFFER - lxsspx vs15, o12, TBUFFER + xxspltw vs12, vs35, 0 + xxspltw vs13, vs35, 1 + xxspltw vs14, vs35, 2 + xxspltw vs15, vs35, 3 - XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r - XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i - XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r - XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i - XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i - XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r - XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i - XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r - xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i - xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i - 
xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r - xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i - xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - stxsspx vs20, o0, TBUFFER // store r0_r - stxsspx vs21, o4, TBUFFER // store r0_i - stxsspx vs22, o8, TBUFFER // store r1_r - stxsspx vs23, o12, TBUFFER // store r1_i - lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 @@ -6453,51 +6533,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 #endif - stxvw4x vs32, o0, TBUFFER - lxsspx vs8, o0, TBUFFER - lxsspx vs9, o4, TBUFFER - lxsspx vs10, o8, TBUFFER - lxsspx vs11, o12, TBUFFER + xxspltw vs8, vs32, 0 + xxspltw vs9, vs32, 1 + xxspltw vs10, vs32, 2 + xxspltw vs11, vs32, 3 - stxvw4x vs33, o0, TBUFFER - lxsspx vs12, o0, TBUFFER - lxsspx vs13, o4, TBUFFER - lxsspx vs14, o8, TBUFFER - lxsspx vs15, o12, TBUFFER + xxspltw vs12, vs33, 0 + xxspltw vs13, vs33, 1 + xxspltw vs14, vs33, 2 + xxspltw vs15, vs33, 3 - XSFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r - XSFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i - XSFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r - XSFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i - XSFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i - XSFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r - XSFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i - XSFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsmulsp vs16, vs6, alpha_r // r1_r * alpha_r - xsmulsp vs17, vs7, alpha_i // r1_i * alpha_i - xsmulsp vs18, vs6, alpha_i // r1_r * alpha_i - 
xsmulsp vs19, vs7, alpha_r // r1_i * alpha_r + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r - xssubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i - xsaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - stxsspx vs20, o0, TBUFFER // store r0_r - stxsspx vs21, o4, TBUFFER // store r0_i - stxsspx vs22, o8, TBUFFER // store r1_r - stxsspx vs23, o12, TBUFFER // store r1_i - lxvw4x vs1, o0, TBUFFER // load r0_r, r0_i, r1_r, r1_i + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i xvaddsp vs0, vs0, vs1 @@ -6547,10 +6628,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 8 - xsmulsp vs32, vs0, vs8 // a0_r*b0_r - xsmulsp vs33, vs1, vs9 // a0_i*b0_i - xsmulsp vs34, vs0, vs9 // a0_r*b0_i - xsmulsp vs35, vs1, vs8 // a0_i*b0_r + xsmuldp vs32, vs0, vs8 // a0_r*b0_r + xsmuldp vs33, vs1, vs9 // a0_i*b0_i + xsmuldp vs34, vs0, vs9 // a0_r*b0_i + xsmuldp vs35, vs1, vs8 // a0_i*b0_r .endm @@ -6571,10 +6652,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi BO, BO, 8 - xsmaddasp vs32, vs0, vs8 // a0_r*b0_r - xsmaddasp vs33, vs1, vs9 // a0_i*b0_i - xsmaddasp vs34, vs0, vs9 // a0_r*b0_i - xsmaddasp vs35, vs1, vs8 // a0_i*b0_r + xsmaddadp vs32, vs0, vs8 // a0_r*b0_r + xsmaddadp vs33, vs1, vs9 // a0_i*b0_i + xsmaddadp vs34, vs0, vs9 // a0_r*b0_i + xsmaddadp vs35, vs1, vs8 // a0_i*b0_r .endm @@ -6595,10 +6676,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 8 - xsmaddasp vs32, vs4, vs16 // a4_r*b0_r - xsmaddasp vs33, vs5, vs17 // a4_i*b0_i - xsmaddasp vs34, vs4, vs17 // a4_r*b0_i - xsmaddasp vs35, vs5, vs16 // a4_i*b0_r + xsmaddadp vs32, vs4, vs16 // a4_r*b0_r + xsmaddadp vs33, vs5, vs17 // a4_i*b0_i + xsmaddadp vs34, vs4, vs17 // a4_r*b0_i + xsmaddadp vs35, vs5, vs16 // a4_i*b0_r .endm @@ -6606,10 +6687,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL1x1_E2 - xsmaddasp vs32, vs4, vs16 // a4_r*b0_r - xsmaddasp vs33, vs5, vs17 // a4_i*b0_i - xsmaddasp vs34, vs4, vs17 // a4_r*b0_i - xsmaddasp vs35, vs5, vs16 // a4_i*b0_r + xsmaddadp vs32, vs4, vs16 // a4_r*b0_r + xsmaddadp vs33, vs5, vs17 // a4_i*b0_i + xsmaddadp vs34, vs4, vs17 // a4_r*b0_i + xsmaddadp vs35, vs5, vs16 // a4_i*b0_r .endm @@ -6630,10 +6711,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 8 - xsmulsp vs32, vs0, vs8 // a0_r*b0_r - xsmulsp vs33, vs1, vs9 // a0_i*b0_i - xsmulsp vs34, vs0, vs9 // a0_r*b0_i - xsmulsp vs35, vs1, vs8 // a0_i*b0_r + xsmuldp vs32, vs0, vs8 // a0_r*b0_r + xsmuldp vs33, vs1, vs9 // a0_i*b0_i + xsmuldp vs34, vs0, vs9 // a0_r*b0_i + xsmuldp vs35, vs1, vs8 // a0_i*b0_r .endm @@ -6654,10 +6735,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi BO, BO, 8 - xsmaddasp vs32, vs0, vs8 // a0_r*b0_r - xsmaddasp vs33, vs1, vs9 // a0_i*b0_i - xsmaddasp vs34, vs0, vs9 // a0_r*b0_i - xsmaddasp vs35, vs1, vs8 // a0_i*b0_r + xsmaddadp vs32, vs0, vs8 // a0_r*b0_r + xsmaddadp vs33, vs1, vs9 // a0_i*b0_i + xsmaddadp vs34, vs0, vs9 // a0_r*b0_i + xsmaddadp vs35, vs1, vs8 // a0_i*b0_r .endm @@ -6689,16 +6770,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. XSFADD_R2 vs4, vs4, vs33 // add a0_i * b0_i XSFADD_I2 vs5, vs5, vs34 // add a0_i * b0_r - xsmulsp vs16, vs4, alpha_r // r0_r * alpha_r - xsmulsp vs17, vs5, alpha_i // r0_i * alpha_i - xsmulsp vs18, vs4, alpha_i // r0_r * alpha_i - xsmulsp vs19, vs5, alpha_r // r0_i * alpha_r + xsmuldp vs16, vs4, alpha_dr // r0_r * alpha_r + xsmuldp vs17, vs5, alpha_di // r0_i * alpha_i + xsmuldp vs18, vs4, alpha_di // r0_r * alpha_i + xsmuldp vs19, vs5, alpha_dr // r0_i * alpha_r - xssubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i - xsaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + xssubdp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsadddp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r - xsaddsp vs0, vs0, vs20 - xsaddsp vs1, vs1, vs21 + xsadddp vs0, vs0, vs20 + xsadddp vs1, vs1, vs21 stxsspx vs0, o0, T2 // store c0_r diff --git a/kernel/power/ctrmm_kernel_8x4_power8.S b/kernel/power/ctrmm_kernel_8x4_power8.S index b15485751..b202114dd 100644 --- a/kernel/power/ctrmm_kernel_8x4_power8.S +++ b/kernel/power/ctrmm_kernel_8x4_power8.S @@ -26,11 +26,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ /************************************************************************************** -* 2016/03/18 Werner Saar (wernsaar@googlemail.com) +* 2016/04/03 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK -* LAPACK-TEST : OK +* LAPACK-TEST : OK **************************************************************************************/ /*********************************************************************/ @@ -129,18 +129,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #define o0 0 -#define alpha_r vs30 -#define alpha_i vs31 -#define alpha_vr vs28 -#define alpha_vi vs29 +#define alpha_dr vs28 +#define alpha_di vs29 +#define alpha_sr vs30 +#define alpha_si vs31 #define o12 r12 #define KKK r13 #define K1 r14 #define L r15 #define o16 r16 -#define TBUFFER r17 +#define NOTUSED r17 #define T2 r19 #define KK r20 #define o8 r21 @@ -278,21 +278,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "cgemm_macros_8x4_power8.S" cmpwi cr0, M, 0 - ble .L999_H1 + ble L999_H1 cmpwi cr0, N, 0 - ble .L999_H1 + ble L999_H1 cmpwi cr0, K, 0 - ble .L999_H1 + ble L999_H1 slwi LDC, LDC, ZBASE_SHIFT - li PRE, 256 + li PRE, 384 li o4 , 4 li o8 , 8 li o12 , 12 li o16 , 16 li o32 , 32 li o48 , 48 - addi TBUFFER, SP, 360 #ifdef __64BIT__ @@ -301,14 +300,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi T1, SP, 224 #endif - lxsspx alpha_r, 0, T1 - lxsspx alpha_i, o8, T1 + lxsspx alpha_dr, 0, T1 + lxsspx alpha_di, o8, T1 + addi T1, SP, 360 + li T2, 0 + + stw T2, 0(T1) + stw T2, 4(T1) + stw T2, 8(T1) + stxsspx alpha_dr, o12, T1 + lxvw4x alpha_sr, o0 , T1 + addi T1, T1, 16 + + stw T2, 0(T1) + stw T2, 4(T1) + stw T2, 8(T1) + stxsspx alpha_di, o12, T1 + lxvw4x alpha_si, o0 , T1 .align 5 #include "ctrmm_logic_8x4_power8.S" -.L999: +L999: addi r3, 0, 0 lfd f14, 0(SP) diff --git a/kernel/power/ctrmm_logic_8x4_power8.S b/kernel/power/ctrmm_logic_8x4_power8.S index f9656e90b..3e50646b0 100644 --- a/kernel/power/ctrmm_logic_8x4_power8.S +++ b/kernel/power/ctrmm_logic_8x4_power8.S @@ -26,18 +26,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** -* 2016/03/18 Werner Saar (wernsaar@googlemail.com) +* 2016/04/03 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK -* LAPACK-TEST : OK +* LAPACK-TEST : OK **************************************************************************************/ - srawi. J, N, 2 - ble .LCTRMM_L4_END + ble CTRMM_L4_END -.LCTRMM_L4_BEGIN: +CTRMM_L4_BEGIN: mr CO, C mr AO, A @@ -49,9 +48,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif srawi. I, M, 3 - ble .LCTRMM_L4x8_END + ble CTRMM_L4x8_END -.LCTRMM_L4x8_BEGIN: +CTRMM_L4x8_BEGIN: #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) @@ -78,11 +77,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr KKK, T1 mr K1, T1 srawi. 
L, K1, 3 // KTEMP / 8 -> L - ble .LCTRMM_L4x8_SUB0 + ble CTRMM_L4x8_SUB0 cmpwi cr0, L, 1 - ble .LCTRMM_L4x8_SUB4 + ble CTRMM_L4x8_SUB4 -.LCTRMM_L4x8_LOOP_START: +CTRMM_L4x8_LOOP_START: LOAD4x8_1 KERNEL4x8_I1 @@ -96,11 +95,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x8_2 addic. L, L, -2 - ble .LCTRMM_L4x8_LOOP_END + ble CTRMM_L4x8_LOOP_END .align 5 -.LCTRMM_L4x8_LOOP: +CTRMM_L4x8_LOOP: KERNEL4x8_1 KERNEL4x8_2 @@ -113,9 +112,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x8_2 addic. L, L, -1 - bgt .LCTRMM_L4x8_LOOP + bgt CTRMM_L4x8_LOOP -.LCTRMM_L4x8_LOOP_END: +CTRMM_L4x8_LOOP_END: KERNEL4x8_1 KERNEL4x8_2 @@ -127,9 +126,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x8_1 KERNEL4x8_E2 - b .LCTRMM_L4x8_SUB1 + b CTRMM_L4x8_SUB1 -.LCTRMM_L4x8_SUB4: +CTRMM_L4x8_SUB4: KERNEL4x8_SUBI1 KERNEL4x8_SUB1 @@ -141,31 +140,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x8_SUB1 KERNEL4x8_SUB1 - b .LCTRMM_L4x8_SUB1 + b CTRMM_L4x8_SUB1 -.LCTRMM_L4x8_SUB0: +CTRMM_L4x8_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL4x8_SUBI1 addic. L, L, -1 - ble .LCTRMM_L4x8_SAVE - b .LCTRMM_L4x8_SUB2 + ble CTRMM_L4x8_SAVE + b CTRMM_L4x8_SUB2 -.LCTRMM_L4x8_SUB1: +CTRMM_L4x8_SUB1: andi. L, K1, 7 // K1 & 7 -> L - ble .LCTRMM_L4x8_SAVE + ble CTRMM_L4x8_SAVE -.LCTRMM_L4x8_SUB2: +CTRMM_L4x8_SUB2: KERNEL4x8_SUB1 addic. L, L, -1 - bgt .LCTRMM_L4x8_SUB2 + bgt CTRMM_L4x8_SUB2 -.LCTRMM_L4x8_SAVE: +CTRMM_L4x8_SAVE: SAVE4x8 @@ -183,16 +182,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addic. I, I, -1 - bgt .LCTRMM_L4x8_BEGIN + bgt CTRMM_L4x8_BEGIN -.LCTRMM_L4x8_END: +CTRMM_L4x8_END: -.LCTRMM_L4x4_BEGIN: +CTRMM_L4x4_BEGIN: andi. T2, M, 7 - ble .LCTRMM_L4x1_END + ble CTRMM_L4x1_END andi. 
T1, M, 4 - ble .LCTRMM_L4x4_END + ble CTRMM_L4x4_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO @@ -218,11 +217,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L - ble .LCTRMM_L4x4_SUB0 + ble CTRMM_L4x4_SUB0 cmpwi cr0, L, 1 - ble .LCTRMM_L4x4_SUB4 + ble CTRMM_L4x4_SUB4 -.LCTRMM_L4x4_LOOP_START: +CTRMM_L4x4_LOOP_START: LOAD4x4_1 KERNEL4x4_I1 @@ -236,11 +235,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x4_2 addic. L, L, -2 - ble .LCTRMM_L4x4_LOOP_END + ble CTRMM_L4x4_LOOP_END .align 5 -.LCTRMM_L4x4_LOOP: +CTRMM_L4x4_LOOP: KERNEL4x4_1 KERNEL4x4_2 @@ -253,9 +252,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x4_2 addic. L, L, -1 - bgt .LCTRMM_L4x4_LOOP + bgt CTRMM_L4x4_LOOP -.LCTRMM_L4x4_LOOP_END: +CTRMM_L4x4_LOOP_END: KERNEL4x4_1 KERNEL4x4_2 @@ -267,9 +266,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x4_1 KERNEL4x4_E2 - b .LCTRMM_L4x4_SUB1 + b CTRMM_L4x4_SUB1 -.LCTRMM_L4x4_SUB4: +CTRMM_L4x4_SUB4: KERNEL4x4_SUBI1 KERNEL4x4_SUB1 @@ -281,31 +280,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x4_SUB1 KERNEL4x4_SUB1 - b .LCTRMM_L4x4_SUB1 + b CTRMM_L4x4_SUB1 -.LCTRMM_L4x4_SUB0: +CTRMM_L4x4_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL4x4_SUBI1 addic. L, L, -1 - ble .LCTRMM_L4x4_SAVE - b .LCTRMM_L4x4_SUB2 + ble CTRMM_L4x4_SAVE + b CTRMM_L4x4_SUB2 -.LCTRMM_L4x4_SUB1: +CTRMM_L4x4_SUB1: andi. L, K1, 7 // K1 & 7 -> L - ble .LCTRMM_L4x4_SAVE + ble CTRMM_L4x4_SAVE -.LCTRMM_L4x4_SUB2: +CTRMM_L4x4_SUB2: KERNEL4x4_SUB1 addic. L, L, -1 - bgt .LCTRMM_L4x4_SUB2 + bgt CTRMM_L4x4_SUB2 -.LCTRMM_L4x4_SAVE: +CTRMM_L4x4_SAVE: SAVE4x4 @@ -322,12 +321,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif -.LCTRMM_L4x4_END: +CTRMM_L4x4_END: -.LCTRMM_L4x2_BEGIN: +CTRMM_L4x2_BEGIN: andi. T1, M, 2 - ble .LCTRMM_L4x2_END + ble CTRMM_L4x2_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO @@ -353,11 +352,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L - ble .LCTRMM_L4x2_SUB0 + ble CTRMM_L4x2_SUB0 cmpwi cr0, L, 1 - ble .LCTRMM_L4x2_SUB4 + ble CTRMM_L4x2_SUB4 -.LCTRMM_L4x2_LOOP_START: +CTRMM_L4x2_LOOP_START: LOAD4x2_1 KERNEL4x2_I1 @@ -371,11 +370,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x2_2 addic. L, L, -2 - ble .LCTRMM_L4x2_LOOP_END + ble CTRMM_L4x2_LOOP_END .align 5 -.LCTRMM_L4x2_LOOP: +CTRMM_L4x2_LOOP: KERNEL4x2_1 KERNEL4x2_2 @@ -388,9 +387,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x2_2 addic. L, L, -1 - bgt .LCTRMM_L4x2_LOOP + bgt CTRMM_L4x2_LOOP -.LCTRMM_L4x2_LOOP_END: +CTRMM_L4x2_LOOP_END: KERNEL4x2_1 KERNEL4x2_2 @@ -402,9 +401,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x2_1 KERNEL4x2_E2 - b .LCTRMM_L4x2_SUB1 + b CTRMM_L4x2_SUB1 -.LCTRMM_L4x2_SUB4: +CTRMM_L4x2_SUB4: KERNEL4x2_SUBI1 KERNEL4x2_SUB1 @@ -416,31 +415,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x2_SUB1 KERNEL4x2_SUB1 - b .LCTRMM_L4x2_SUB1 + b CTRMM_L4x2_SUB1 -.LCTRMM_L4x2_SUB0: +CTRMM_L4x2_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL4x2_SUBI1 addic. L, L, -1 - ble .LCTRMM_L4x2_SAVE - b .LCTRMM_L4x2_SUB2 + ble CTRMM_L4x2_SAVE + b CTRMM_L4x2_SUB2 -.LCTRMM_L4x2_SUB1: +CTRMM_L4x2_SUB1: andi. L, K1, 7 // K1 & 7 -> L - ble .LCTRMM_L4x2_SAVE + ble CTRMM_L4x2_SAVE -.LCTRMM_L4x2_SUB2: +CTRMM_L4x2_SUB2: KERNEL4x2_SUB1 addic. 
L, L, -1 - bgt .LCTRMM_L4x2_SUB2 + bgt CTRMM_L4x2_SUB2 -.LCTRMM_L4x2_SAVE: +CTRMM_L4x2_SAVE: SAVE4x2 @@ -457,12 +456,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -.LCTRMM_L4x2_END: +CTRMM_L4x2_END: -.LCTRMM_L4x1_BEGIN: +CTRMM_L4x1_BEGIN: andi. T1, M, 1 - ble .LCTRMM_L4x1_END + ble CTRMM_L4x1_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO @@ -488,11 +487,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L - ble .LCTRMM_L4x1_SUB0 + ble CTRMM_L4x1_SUB0 cmpwi cr0, L, 1 - ble .LCTRMM_L4x1_SUB4 + ble CTRMM_L4x1_SUB4 -.LCTRMM_L4x1_LOOP_START: +CTRMM_L4x1_LOOP_START: LOAD4x1_1 KERNEL4x1_I1 @@ -506,11 +505,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x1_2 addic. L, L, -2 - ble .LCTRMM_L4x1_LOOP_END + ble CTRMM_L4x1_LOOP_END .align 5 -.LCTRMM_L4x1_LOOP: +CTRMM_L4x1_LOOP: KERNEL4x1_1 KERNEL4x1_2 @@ -523,9 +522,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x1_2 addic. L, L, -1 - bgt .LCTRMM_L4x1_LOOP + bgt CTRMM_L4x1_LOOP -.LCTRMM_L4x1_LOOP_END: +CTRMM_L4x1_LOOP_END: KERNEL4x1_1 KERNEL4x1_2 @@ -537,9 +536,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x1_1 KERNEL4x1_E2 - b .LCTRMM_L4x1_SUB1 + b CTRMM_L4x1_SUB1 -.LCTRMM_L4x1_SUB4: +CTRMM_L4x1_SUB4: KERNEL4x1_SUBI1 KERNEL4x1_SUB1 @@ -551,31 +550,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL4x1_SUB1 KERNEL4x1_SUB1 - b .LCTRMM_L4x1_SUB1 + b CTRMM_L4x1_SUB1 -.LCTRMM_L4x1_SUB0: +CTRMM_L4x1_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL4x1_SUBI1 addic. L, L, -1 - ble .LCTRMM_L4x1_SAVE - b .LCTRMM_L4x1_SUB2 + ble CTRMM_L4x1_SAVE + b CTRMM_L4x1_SUB2 -.LCTRMM_L4x1_SUB1: +CTRMM_L4x1_SUB1: andi. 
L, K1, 7 // K1 & 7 -> L - ble .LCTRMM_L4x1_SAVE + ble CTRMM_L4x1_SAVE -.LCTRMM_L4x1_SUB2: +CTRMM_L4x1_SUB2: KERNEL4x1_SUB1 addic. L, L, -1 - bgt .LCTRMM_L4x1_SUB2 + bgt CTRMM_L4x1_SUB2 -.LCTRMM_L4x1_SAVE: +CTRMM_L4x1_SAVE: SAVE4x1 @@ -592,7 +591,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -.LCTRMM_L4x1_END: +CTRMM_L4x1_END: slwi T1, K, 5 add B, B, T1 @@ -603,23 +602,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addic. J, J, -1 - bgt .LCTRMM_L4_BEGIN + bgt CTRMM_L4_BEGIN andi. T2, N, 3 - ble .L999_H2 + ble L999_H2 -.LCTRMM_L4_END: +CTRMM_L4_END: - b .LCTRMM_L2_BEGIN + b CTRMM_L2_BEGIN -.L999_H1: +L999_H1: - b .L999_H2 + b L999_H2 -.LCTRMM_L2_BEGIN: +CTRMM_L2_BEGIN: andi. T1, N, 2 - ble .LCTRMM_L2_END + ble CTRMM_L2_END mr CO, C mr AO, A slwi T1, LDC , 1 @@ -630,9 +629,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif srawi. I, M, 3 - ble .LCTRMM_L2x8_END + ble CTRMM_L2x8_END -.LCTRMM_L2x8_BEGIN: +CTRMM_L2x8_BEGIN: #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) @@ -659,11 +658,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L - ble .LCTRMM_L2x8_SUB0 + ble CTRMM_L2x8_SUB0 cmpwi cr0, L, 1 - ble .LCTRMM_L2x8_SUB4 + ble CTRMM_L2x8_SUB4 -.LCTRMM_L2x8_LOOP_START: +CTRMM_L2x8_LOOP_START: LOAD2x8_1 KERNEL2x8_I1 @@ -677,11 +676,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x8_2 addic. L, L, -2 - ble .LCTRMM_L2x8_LOOP_END + ble CTRMM_L2x8_LOOP_END .align 5 -.LCTRMM_L2x8_LOOP: +CTRMM_L2x8_LOOP: KERNEL2x8_1 KERNEL2x8_2 @@ -694,9 +693,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x8_2 addic. 
L, L, -1 - bgt .LCTRMM_L2x8_LOOP + bgt CTRMM_L2x8_LOOP -.LCTRMM_L2x8_LOOP_END: +CTRMM_L2x8_LOOP_END: KERNEL2x8_1 KERNEL2x8_2 @@ -708,9 +707,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x8_1 KERNEL2x8_E2 - b .LCTRMM_L2x8_SUB1 + b CTRMM_L2x8_SUB1 -.LCTRMM_L2x8_SUB4: +CTRMM_L2x8_SUB4: KERNEL2x8_SUBI1 KERNEL2x8_SUB1 @@ -722,31 +721,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x8_SUB1 KERNEL2x8_SUB1 - b .LCTRMM_L2x8_SUB1 + b CTRMM_L2x8_SUB1 -.LCTRMM_L2x8_SUB0: +CTRMM_L2x8_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL2x8_SUBI1 addic. L, L, -1 - ble .LCTRMM_L2x8_SAVE - b .LCTRMM_L2x8_SUB2 + ble CTRMM_L2x8_SAVE + b CTRMM_L2x8_SUB2 -.LCTRMM_L2x8_SUB1: +CTRMM_L2x8_SUB1: andi. L, K1, 7 // K1 & 7 -> L - ble .LCTRMM_L2x8_SAVE + ble CTRMM_L2x8_SAVE -.LCTRMM_L2x8_SUB2: +CTRMM_L2x8_SUB2: KERNEL2x8_SUB1 addic. L, L, -1 - bgt .LCTRMM_L2x8_SUB2 + bgt CTRMM_L2x8_SUB2 -.LCTRMM_L2x8_SAVE: +CTRMM_L2x8_SAVE: SAVE2x8 @@ -764,16 +763,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addic. I, I, -1 - bgt .LCTRMM_L2x8_BEGIN + bgt CTRMM_L2x8_BEGIN -.LCTRMM_L2x8_END: +CTRMM_L2x8_END: -.LCTRMM_L2x4_BEGIN: +CTRMM_L2x4_BEGIN: andi. T2, M, 7 - ble .LCTRMM_L2x1_END + ble CTRMM_L2x1_END andi. T1, M, 4 - ble .LCTRMM_L2x4_END + ble CTRMM_L2x4_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO @@ -799,11 +798,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L - ble .LCTRMM_L2x4_SUB0 + ble CTRMM_L2x4_SUB0 cmpwi cr0, L, 1 - ble .LCTRMM_L2x4_SUB4 + ble CTRMM_L2x4_SUB4 -.LCTRMM_L2x4_LOOP_START: +CTRMM_L2x4_LOOP_START: LOAD2x4_1 KERNEL2x4_I1 @@ -817,11 +816,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x4_2 addic. 
L, L, -2 - ble .LCTRMM_L2x4_LOOP_END + ble CTRMM_L2x4_LOOP_END .align 5 -.LCTRMM_L2x4_LOOP: +CTRMM_L2x4_LOOP: KERNEL2x4_1 KERNEL2x4_2 @@ -834,9 +833,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x4_2 addic. L, L, -1 - bgt .LCTRMM_L2x4_LOOP + bgt CTRMM_L2x4_LOOP -.LCTRMM_L2x4_LOOP_END: +CTRMM_L2x4_LOOP_END: KERNEL2x4_1 KERNEL2x4_2 @@ -848,9 +847,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x4_1 KERNEL2x4_E2 - b .LCTRMM_L2x4_SUB1 + b CTRMM_L2x4_SUB1 -.LCTRMM_L2x4_SUB4: +CTRMM_L2x4_SUB4: KERNEL2x4_SUBI1 KERNEL2x4_SUB1 @@ -862,31 +861,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x4_SUB1 KERNEL2x4_SUB1 - b .LCTRMM_L2x4_SUB1 + b CTRMM_L2x4_SUB1 -.LCTRMM_L2x4_SUB0: +CTRMM_L2x4_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL2x4_SUBI1 addic. L, L, -1 - ble .LCTRMM_L2x4_SAVE - b .LCTRMM_L2x4_SUB2 + ble CTRMM_L2x4_SAVE + b CTRMM_L2x4_SUB2 -.LCTRMM_L2x4_SUB1: +CTRMM_L2x4_SUB1: andi. L, K1, 7 // K1 & 7 -> L - ble .LCTRMM_L2x4_SAVE + ble CTRMM_L2x4_SAVE -.LCTRMM_L2x4_SUB2: +CTRMM_L2x4_SUB2: KERNEL2x4_SUB1 addic. L, L, -1 - bgt .LCTRMM_L2x4_SUB2 + bgt CTRMM_L2x4_SUB2 -.LCTRMM_L2x4_SAVE: +CTRMM_L2x4_SAVE: SAVE2x4 @@ -903,12 +902,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -.LCTRMM_L2x4_END: +CTRMM_L2x4_END: -.LCTRMM_L2x2_BEGIN: +CTRMM_L2x2_BEGIN: andi. T1, M, 2 - ble .LCTRMM_L2x2_END + ble CTRMM_L2x2_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO @@ -934,11 +933,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L - ble .LCTRMM_L2x2_SUB0 + ble CTRMM_L2x2_SUB0 cmpwi cr0, L, 1 - ble .LCTRMM_L2x2_SUB4 + ble CTRMM_L2x2_SUB4 -.LCTRMM_L2x2_LOOP_START: +CTRMM_L2x2_LOOP_START: LOAD2x2_1 KERNEL2x2_I1 @@ -952,11 +951,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
KERNEL2x2_2 addic. L, L, -2 - ble .LCTRMM_L2x2_LOOP_END + ble CTRMM_L2x2_LOOP_END .align 5 -.LCTRMM_L2x2_LOOP: +CTRMM_L2x2_LOOP: KERNEL2x2_1 KERNEL2x2_2 @@ -969,9 +968,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x2_2 addic. L, L, -1 - bgt .LCTRMM_L2x2_LOOP + bgt CTRMM_L2x2_LOOP -.LCTRMM_L2x2_LOOP_END: +CTRMM_L2x2_LOOP_END: KERNEL2x2_1 KERNEL2x2_2 @@ -983,9 +982,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x2_1 KERNEL2x2_E2 - b .LCTRMM_L2x2_SUB1 + b CTRMM_L2x2_SUB1 -.LCTRMM_L2x2_SUB4: +CTRMM_L2x2_SUB4: KERNEL2x2_SUBI1 KERNEL2x2_SUB1 @@ -997,31 +996,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x2_SUB1 KERNEL2x2_SUB1 - b .LCTRMM_L2x2_SUB1 + b CTRMM_L2x2_SUB1 -.LCTRMM_L2x2_SUB0: +CTRMM_L2x2_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL2x2_SUBI1 addic. L, L, -1 - ble .LCTRMM_L2x2_SAVE - b .LCTRMM_L2x2_SUB2 + ble CTRMM_L2x2_SAVE + b CTRMM_L2x2_SUB2 -.LCTRMM_L2x2_SUB1: +CTRMM_L2x2_SUB1: andi. L, K1, 7 // K1 & 7 -> L - ble .LCTRMM_L2x2_SAVE + ble CTRMM_L2x2_SAVE -.LCTRMM_L2x2_SUB2: +CTRMM_L2x2_SUB2: KERNEL2x2_SUB1 addic. L, L, -1 - bgt .LCTRMM_L2x2_SUB2 + bgt CTRMM_L2x2_SUB2 -.LCTRMM_L2x2_SAVE: +CTRMM_L2x2_SAVE: SAVE2x2 @@ -1038,12 +1037,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -.LCTRMM_L2x2_END: +CTRMM_L2x2_END: -.LCTRMM_L2x1_BEGIN: +CTRMM_L2x1_BEGIN: andi. T1, M, 1 - ble .LCTRMM_L2x1_END + ble CTRMM_L2x1_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO @@ -1069,11 +1068,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr KKK, T1 mr K1, T1 srawi. 
L, K1, 3 // KTEMP / 8 -> L - ble .LCTRMM_L2x1_SUB0 + ble CTRMM_L2x1_SUB0 cmpwi cr0, L, 1 - ble .LCTRMM_L2x1_SUB4 + ble CTRMM_L2x1_SUB4 -.LCTRMM_L2x1_LOOP_START: +CTRMM_L2x1_LOOP_START: LOAD2x1_1 KERNEL2x1_I1 @@ -1087,11 +1086,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x1_2 addic. L, L, -2 - ble .LCTRMM_L2x1_LOOP_END + ble CTRMM_L2x1_LOOP_END .align 5 -.LCTRMM_L2x1_LOOP: +CTRMM_L2x1_LOOP: KERNEL2x1_1 KERNEL2x1_2 @@ -1104,9 +1103,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x1_2 addic. L, L, -1 - bgt .LCTRMM_L2x1_LOOP + bgt CTRMM_L2x1_LOOP -.LCTRMM_L2x1_LOOP_END: +CTRMM_L2x1_LOOP_END: KERNEL2x1_1 KERNEL2x1_2 @@ -1118,9 +1117,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x1_1 KERNEL2x1_E2 - b .LCTRMM_L2x1_SUB1 + b CTRMM_L2x1_SUB1 -.LCTRMM_L2x1_SUB4: +CTRMM_L2x1_SUB4: KERNEL2x1_SUBI1 KERNEL2x1_SUB1 @@ -1132,31 +1131,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL2x1_SUB1 KERNEL2x1_SUB1 - b .LCTRMM_L2x1_SUB1 + b CTRMM_L2x1_SUB1 -.LCTRMM_L2x1_SUB0: +CTRMM_L2x1_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL2x1_SUBI1 addic. L, L, -1 - ble .LCTRMM_L2x1_SAVE - b .LCTRMM_L2x1_SUB2 + ble CTRMM_L2x1_SAVE + b CTRMM_L2x1_SUB2 -.LCTRMM_L2x1_SUB1: +CTRMM_L2x1_SUB1: andi. L, K1, 7 // K1 & 7 -> L - ble .LCTRMM_L2x1_SAVE + ble CTRMM_L2x1_SAVE -.LCTRMM_L2x1_SUB2: +CTRMM_L2x1_SUB2: KERNEL2x1_SUB1 addic. L, L, -1 - bgt .LCTRMM_L2x1_SUB2 + bgt CTRMM_L2x1_SUB2 -.LCTRMM_L2x1_SAVE: +CTRMM_L2x1_SAVE: SAVE2x1 @@ -1173,7 +1172,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -.LCTRMM_L2x1_END: +CTRMM_L2x1_END: slwi T1, K, 4 add B, B, T1 @@ -1183,18 +1182,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -.LCTRMM_L2_END: +CTRMM_L2_END: - b .LCTRMM_L1_BEGIN + b CTRMM_L1_BEGIN -.L999_H2: +L999_H2: - b .L999 + b L999 -.LCTRMM_L1_BEGIN: +CTRMM_L1_BEGIN: andi. 
T1, N, 1 - ble .LCTRMM_L1_END + ble CTRMM_L1_END mr CO, C mr AO, A @@ -1203,9 +1202,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif srawi. I, M, 3 - ble .LCTRMM_L1x8_END + ble CTRMM_L1x8_END -.LCTRMM_L1x8_BEGIN: +CTRMM_L1x8_BEGIN: #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) @@ -1232,11 +1231,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L - ble .LCTRMM_L1x8_SUB0 + ble CTRMM_L1x8_SUB0 cmpwi cr0, L, 1 - ble .LCTRMM_L1x8_SUB4 + ble CTRMM_L1x8_SUB4 -.LCTRMM_L1x8_LOOP_START: +CTRMM_L1x8_LOOP_START: LOAD1x8_1 KERNEL1x8_I1 @@ -1250,11 +1249,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x8_2 addic. L, L, -2 - ble .LCTRMM_L1x8_LOOP_END + ble CTRMM_L1x8_LOOP_END .align 5 -.LCTRMM_L1x8_LOOP: +CTRMM_L1x8_LOOP: KERNEL1x8_1 KERNEL1x8_2 @@ -1267,9 +1266,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x8_2 addic. L, L, -1 - bgt .LCTRMM_L1x8_LOOP + bgt CTRMM_L1x8_LOOP -.LCTRMM_L1x8_LOOP_END: +CTRMM_L1x8_LOOP_END: KERNEL1x8_1 KERNEL1x8_2 @@ -1281,9 +1280,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x8_1 KERNEL1x8_E2 - b .LCTRMM_L1x8_SUB1 + b CTRMM_L1x8_SUB1 -.LCTRMM_L1x8_SUB4: +CTRMM_L1x8_SUB4: KERNEL1x8_SUBI1 KERNEL1x8_SUB1 @@ -1295,31 +1294,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x8_SUB1 KERNEL1x8_SUB1 - b .LCTRMM_L1x8_SUB1 + b CTRMM_L1x8_SUB1 -.LCTRMM_L1x8_SUB0: +CTRMM_L1x8_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL1x8_SUBI1 addic. L, L, -1 - ble .LCTRMM_L1x8_SAVE - b .LCTRMM_L1x8_SUB2 + ble CTRMM_L1x8_SAVE + b CTRMM_L1x8_SUB2 -.LCTRMM_L1x8_SUB1: +CTRMM_L1x8_SUB1: andi. L, K1, 7 // K1 & 7 -> L - ble .LCTRMM_L1x8_SAVE + ble CTRMM_L1x8_SAVE -.LCTRMM_L1x8_SUB2: +CTRMM_L1x8_SUB2: KERNEL1x8_SUB1 addic. 
L, L, -1 - bgt .LCTRMM_L1x8_SUB2 + bgt CTRMM_L1x8_SUB2 -.LCTRMM_L1x8_SAVE: +CTRMM_L1x8_SAVE: SAVE1x8 @@ -1337,16 +1336,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addic. I, I, -1 - bgt .LCTRMM_L1x8_BEGIN + bgt CTRMM_L1x8_BEGIN -.LCTRMM_L1x8_END: +CTRMM_L1x8_END: -.LCTRMM_L1x4_BEGIN: +CTRMM_L1x4_BEGIN: andi. T2, M, 7 - ble .LCTRMM_L1x1_END + ble CTRMM_L1x1_END andi. T1, M, 4 - ble .LCTRMM_L1x4_END + ble CTRMM_L1x4_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO @@ -1372,11 +1371,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L - ble .LCTRMM_L1x4_SUB0 + ble CTRMM_L1x4_SUB0 cmpwi cr0, L, 1 - ble .LCTRMM_L1x4_SUB4 + ble CTRMM_L1x4_SUB4 -.LCTRMM_L1x4_LOOP_START: +CTRMM_L1x4_LOOP_START: LOAD1x4_1 KERNEL1x4_I1 @@ -1390,11 +1389,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x4_2 addic. L, L, -2 - ble .LCTRMM_L1x4_LOOP_END + ble CTRMM_L1x4_LOOP_END .align 5 -.LCTRMM_L1x4_LOOP: +CTRMM_L1x4_LOOP: KERNEL1x4_1 KERNEL1x4_2 @@ -1407,9 +1406,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x4_2 addic. L, L, -1 - bgt .LCTRMM_L1x4_LOOP + bgt CTRMM_L1x4_LOOP -.LCTRMM_L1x4_LOOP_END: +CTRMM_L1x4_LOOP_END: KERNEL1x4_1 KERNEL1x4_2 @@ -1421,9 +1420,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x4_1 KERNEL1x4_E2 - b .LCTRMM_L1x4_SUB1 + b CTRMM_L1x4_SUB1 -.LCTRMM_L1x4_SUB4: +CTRMM_L1x4_SUB4: KERNEL1x4_SUBI1 KERNEL1x4_SUB1 @@ -1435,31 +1434,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x4_SUB1 KERNEL1x4_SUB1 - b .LCTRMM_L1x4_SUB1 + b CTRMM_L1x4_SUB1 -.LCTRMM_L1x4_SUB0: +CTRMM_L1x4_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL1x4_SUBI1 addic. 
L, L, -1 - ble .LCTRMM_L1x4_SAVE - b .LCTRMM_L1x4_SUB2 + ble CTRMM_L1x4_SAVE + b CTRMM_L1x4_SUB2 -.LCTRMM_L1x4_SUB1: +CTRMM_L1x4_SUB1: andi. L, K1, 7 // K1 & 7 -> L - ble .LCTRMM_L1x4_SAVE + ble CTRMM_L1x4_SAVE -.LCTRMM_L1x4_SUB2: +CTRMM_L1x4_SUB2: KERNEL1x4_SUB1 addic. L, L, -1 - bgt .LCTRMM_L1x4_SUB2 + bgt CTRMM_L1x4_SUB2 -.LCTRMM_L1x4_SAVE: +CTRMM_L1x4_SAVE: SAVE1x4 @@ -1476,12 +1475,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -.LCTRMM_L1x4_END: +CTRMM_L1x4_END: -.LCTRMM_L1x2_BEGIN: +CTRMM_L1x2_BEGIN: andi. T1, M, 2 - ble .LCTRMM_L1x2_END + ble CTRMM_L1x2_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO @@ -1507,11 +1506,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L - ble .LCTRMM_L1x2_SUB0 + ble CTRMM_L1x2_SUB0 cmpwi cr0, L, 1 - ble .LCTRMM_L1x2_SUB4 + ble CTRMM_L1x2_SUB4 -.LCTRMM_L1x2_LOOP_START: +CTRMM_L1x2_LOOP_START: LOAD1x2_1 KERNEL1x2_I1 @@ -1525,11 +1524,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x2_2 addic. L, L, -2 - ble .LCTRMM_L1x2_LOOP_END + ble CTRMM_L1x2_LOOP_END .align 5 -.LCTRMM_L1x2_LOOP: +CTRMM_L1x2_LOOP: KERNEL1x2_1 KERNEL1x2_2 @@ -1542,9 +1541,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x2_2 addic. L, L, -1 - bgt .LCTRMM_L1x2_LOOP + bgt CTRMM_L1x2_LOOP -.LCTRMM_L1x2_LOOP_END: +CTRMM_L1x2_LOOP_END: KERNEL1x2_1 KERNEL1x2_2 @@ -1556,9 +1555,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x2_1 KERNEL1x2_E2 - b .LCTRMM_L1x2_SUB1 + b CTRMM_L1x2_SUB1 -.LCTRMM_L1x2_SUB4: +CTRMM_L1x2_SUB4: KERNEL1x2_SUBI1 KERNEL1x2_SUB1 @@ -1570,31 +1569,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x2_SUB1 KERNEL1x2_SUB1 - b .LCTRMM_L1x2_SUB1 + b CTRMM_L1x2_SUB1 -.LCTRMM_L1x2_SUB0: +CTRMM_L1x2_SUB0: andi. 
L, K1, 7 // K1 & 7 -> L KERNEL1x2_SUBI1 addic. L, L, -1 - ble .LCTRMM_L1x2_SAVE - b .LCTRMM_L1x2_SUB2 + ble CTRMM_L1x2_SAVE + b CTRMM_L1x2_SUB2 -.LCTRMM_L1x2_SUB1: +CTRMM_L1x2_SUB1: andi. L, K1, 7 // K1 & 7 -> L - ble .LCTRMM_L1x2_SAVE + ble CTRMM_L1x2_SAVE -.LCTRMM_L1x2_SUB2: +CTRMM_L1x2_SUB2: KERNEL1x2_SUB1 addic. L, L, -1 - bgt .LCTRMM_L1x2_SUB2 + bgt CTRMM_L1x2_SUB2 -.LCTRMM_L1x2_SAVE: +CTRMM_L1x2_SAVE: SAVE1x2 @@ -1611,12 +1610,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -.LCTRMM_L1x2_END: +CTRMM_L1x2_END: -.LCTRMM_L1x1_BEGIN: +CTRMM_L1x1_BEGIN: andi. T1, M, 1 - ble .LCTRMM_L1x1_END + ble CTRMM_L1x1_END #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mr BO, B // B -> BO @@ -1642,11 +1641,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. mr KKK, T1 mr K1, T1 srawi. L, K1, 3 // KTEMP / 8 -> L - ble .LCTRMM_L1x1_SUB0 + ble CTRMM_L1x1_SUB0 cmpwi cr0, L, 1 - ble .LCTRMM_L1x1_SUB4 + ble CTRMM_L1x1_SUB4 -.LCTRMM_L1x1_LOOP_START: +CTRMM_L1x1_LOOP_START: LOAD1x1_1 KERNEL1x1_I1 @@ -1660,11 +1659,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x1_2 addic. L, L, -2 - ble .LCTRMM_L1x1_LOOP_END + ble CTRMM_L1x1_LOOP_END .align 5 -.LCTRMM_L1x1_LOOP: +CTRMM_L1x1_LOOP: KERNEL1x1_1 KERNEL1x1_2 @@ -1677,9 +1676,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x1_2 addic. L, L, -1 - bgt .LCTRMM_L1x1_LOOP + bgt CTRMM_L1x1_LOOP -.LCTRMM_L1x1_LOOP_END: +CTRMM_L1x1_LOOP_END: KERNEL1x1_1 KERNEL1x1_2 @@ -1691,9 +1690,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. KERNEL1x1_1 KERNEL1x1_E2 - b .LCTRMM_L1x1_SUB1 + b CTRMM_L1x1_SUB1 -.LCTRMM_L1x1_SUB4: +CTRMM_L1x1_SUB4: KERNEL1x1_SUBI1 KERNEL1x1_SUB1 @@ -1705,31 +1704,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
KERNEL1x1_SUB1 KERNEL1x1_SUB1 - b .LCTRMM_L1x1_SUB1 + b CTRMM_L1x1_SUB1 -.LCTRMM_L1x1_SUB0: +CTRMM_L1x1_SUB0: andi. L, K1, 7 // K1 & 7 -> L KERNEL1x1_SUBI1 addic. L, L, -1 - ble .LCTRMM_L1x1_SAVE - b .LCTRMM_L1x1_SUB2 + ble CTRMM_L1x1_SAVE + b CTRMM_L1x1_SUB2 -.LCTRMM_L1x1_SUB1: +CTRMM_L1x1_SUB1: andi. L, K1, 7 // K1 & 7 -> L - ble .LCTRMM_L1x1_SAVE + ble CTRMM_L1x1_SAVE -.LCTRMM_L1x1_SUB2: +CTRMM_L1x1_SUB2: KERNEL1x1_SUB1 addic. L, L, -1 - bgt .LCTRMM_L1x1_SUB2 + bgt CTRMM_L1x1_SUB2 -.LCTRMM_L1x1_SAVE: +CTRMM_L1x1_SAVE: SAVE1x1 @@ -1746,11 +1745,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -.LCTRMM_L1x1_END: +CTRMM_L1x1_END: #if !defined(LEFT) addi KK, KK, 1 // KK += Number of values in B #endif -.LCTRMM_L1_END: +CTRMM_L1_END: From c5b1fbcb2ec74dfd2c7cae4a838dda425c3e0af5 Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Mon, 4 Apr 2016 09:12:08 +0200 Subject: [PATCH 40/48] updated optimized cgemm- and ctrmm-kernel for POWER8 --- kernel/power/cgemm_kernel_8x4_power8.S | 10 +- kernel/power/cgemm_logic_8x4_power8.S | 145 +- kernel/power/cgemm_macros_8x4_power8.S | 1327 ++--- kernel/power/ctrmm_kernel_8x4_power8.S | 4 +- kernel/power/ctrmm_logic_8x4_power8.S | 16 +- kernel/power/ctrmm_macros_8x4_power8.S | 6794 ++++++++++++++++++++++++ param.h | 2 +- 7 files changed, 7393 insertions(+), 905 deletions(-) create mode 100644 kernel/power/ctrmm_macros_8x4_power8.S diff --git a/kernel/power/cgemm_kernel_8x4_power8.S b/kernel/power/cgemm_kernel_8x4_power8.S index a7e706699..f90069e3f 100644 --- a/kernel/power/cgemm_kernel_8x4_power8.S +++ b/kernel/power/cgemm_kernel_8x4_power8.S @@ -26,7 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ /************************************************************************************** -* 2016/04/03 Werner Saar (wernsaar@googlemail.com) +* 2016/04/04 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK @@ -137,12 +137,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define alpha_si vs31 -#define NOTUSED r14 +#define BBUFFER r14 #define L r15 #define o12 r16 #define o4 r17 #define T2 r19 -#define KK r20 +#define BBO r20 #define o8 r21 #define I r22 #define J r23 @@ -290,6 +290,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. li o32 , 32 li o48 , 48 + li T1, 256 + slwi T1, T1, 9 // 131072 + sub BBUFFER, A, T1 // temp buffer for B unrolled + #ifdef __64BIT__ addi T1 , SP, 296 diff --git a/kernel/power/cgemm_logic_8x4_power8.S b/kernel/power/cgemm_logic_8x4_power8.S index 851a09aaa..db2a57f91 100644 --- a/kernel/power/cgemm_logic_8x4_power8.S +++ b/kernel/power/cgemm_logic_8x4_power8.S @@ -26,7 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** -* 2016/04/03 Werner Saar (wernsaar@googlemail.com) +* 2016/04/04 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK @@ -38,6 +38,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
CGEMM_L4_BEGIN: + mr BO, B + mr BBO, BBUFFER + slwi T1, K, 3 + +CGEMM_L4_COPYB: + dcbtst BBO, PRE + + lxvw4x vs3, o0, BO + lxvw4x vs11, o16, BO + xxspltw vs4, vs3, 0 + xxspltw vs5, vs3, 1 + xxspltw vs6, vs3, 2 + xxspltw vs7, vs3, 3 + xxspltw vs12, vs11, 0 + xxspltw vs13, vs11, 1 + xxspltw vs14, vs11, 2 + xxspltw vs15, vs11, 3 + stxvw4x vs4, o0, BBO + stxvw4x vs5, o16, BBO + stxvw4x vs6, o32, BBO + stxvw4x vs7, o48, BBO + addi BO, BO, 32 + addi BBO, BBO, 64 + stxvw4x vs12, o0, BBO + stxvw4x vs13, o16, BBO + stxvw4x vs14, o32, BBO + stxvw4x vs15, o48, BBO + addic. T1, T1, -8 + addi BBO, BBO, 64 + + bge CGEMM_L4_COPYB + + mr CO, C mr AO, A slwi T1, LDC , 2 @@ -48,7 +81,7 @@ CGEMM_L4_BEGIN: CGEMM_L4x8_BEGIN: - mr BO, B + mr BO, BBUFFER srawi. L, K, 3 ble CGEMM_L4x8_SUB0 cmpwi cr0, L, 1 @@ -59,18 +92,25 @@ CGEMM_L4x8_LOOP_START: dcbt AO, PRE dcbt BO, PRE LOAD4x8_1 + dcbt BO, PRE KERNEL4x8_I1 + dcbt BO, PRE dcbt AO, PRE KERNEL4x8_2 + dcbt BO, PRE KERNEL4x8_1 + dcbt BO, PRE dcbt AO, PRE KERNEL4x8_2 + dcbt BO, PRE KERNEL4x8_1 - dcbt AO, PRE dcbt BO, PRE + dcbt AO, PRE KERNEL4x8_2 + dcbt BO, PRE KERNEL4x8_1 + dcbt BO, PRE dcbt AO, PRE KERNEL4x8_2 @@ -81,18 +121,25 @@ CGEMM_L4x8_LOOP_START: CGEMM_L4x8_LOOP: + dcbt BO, PRE KERNEL4x8_1 + dcbt BO, PRE dcbt AO, PRE KERNEL4x8_2 + dcbt BO, PRE KERNEL4x8_1 + dcbt BO, PRE dcbt AO, PRE KERNEL4x8_2 + dcbt BO, PRE KERNEL4x8_1 - dcbt AO, PRE dcbt BO, PRE + dcbt AO, PRE KERNEL4x8_2 + dcbt BO, PRE KERNEL4x8_1 + dcbt BO, PRE dcbt AO, PRE KERNEL4x8_2 @@ -101,7 +148,9 @@ CGEMM_L4x8_LOOP: CGEMM_L4x8_LOOP_END: + dcbt BO, PRE KERNEL4x8_1 + dcbt BO, PRE dcbt AO, PRE KERNEL4x8_2 KERNEL4x8_1 @@ -168,7 +217,7 @@ CGEMM_L4x4_BEGIN: andi. T1, M, 4 ble CGEMM_L4x4_END - mr BO, B + mr BO, BBUFFER srawi. L, K, 3 ble CGEMM_L4x4_SUB0 cmpwi cr0, L, 1 @@ -268,7 +317,7 @@ CGEMM_L4x2_BEGIN: andi. T1, M, 2 ble CGEMM_L4x2_END - mr BO, B + mr BO, BBUFFER srawi. L, K, 3 ble CGEMM_L4x2_SUB0 cmpwi cr0, L, 1 @@ -368,7 +417,7 @@ CGEMM_L4x1_BEGIN: andi. 
T1, M, 1 ble CGEMM_L4x1_END - mr BO, B + mr BO, BBUFFER srawi. L, K, 3 ble CGEMM_L4x1_SUB0 cmpwi cr0, L, 1 @@ -482,6 +531,39 @@ L999_H1: CGEMM_L2_BEGIN: + mr BO, B + mr BBO, BBUFFER + slwi T1, K, 2 + +CGEMM_L2_COPYB: + dcbtst BBO, PRE + + lxvw4x vs3, o0, BO + lxvw4x vs11, o16, BO + xxspltw vs4, vs3, 0 + xxspltw vs5, vs3, 1 + xxspltw vs6, vs3, 2 + xxspltw vs7, vs3, 3 + xxspltw vs12, vs11, 0 + xxspltw vs13, vs11, 1 + xxspltw vs14, vs11, 2 + xxspltw vs15, vs11, 3 + stxvw4x vs4, o0, BBO + stxvw4x vs5, o16, BBO + stxvw4x vs6, o32, BBO + stxvw4x vs7, o48, BBO + addi BO, BO, 32 + addi BBO, BBO, 64 + stxvw4x vs12, o0, BBO + stxvw4x vs13, o16, BBO + stxvw4x vs14, o32, BBO + stxvw4x vs15, o48, BBO + addic. T1, T1, -8 + addi BBO, BBO, 64 + + bge CGEMM_L2_COPYB + + andi. T1, N, 2 ble CGEMM_L2_END mr CO, C @@ -494,7 +576,7 @@ CGEMM_L2_BEGIN: CGEMM_L2x8_BEGIN: - mr BO, B + mr BO, BBUFFER srawi. L, K, 3 ble CGEMM_L2x8_SUB0 cmpwi cr0, L, 1 @@ -611,7 +693,7 @@ CGEMM_L2x4_BEGIN: andi. T1, M, 4 ble CGEMM_L2x4_END - mr BO, B + mr BO, BBUFFER srawi. L, K, 3 ble CGEMM_L2x4_SUB0 cmpwi cr0, L, 1 @@ -711,7 +793,7 @@ CGEMM_L2x2_BEGIN: andi. T1, M, 2 ble CGEMM_L2x2_END - mr BO, B + mr BO, BBUFFER srawi. L, K, 3 ble CGEMM_L2x2_SUB0 cmpwi cr0, L, 1 @@ -811,7 +893,7 @@ CGEMM_L2x1_BEGIN: andi. T1, M, 1 ble CGEMM_L2x1_END - mr BO, B + mr BO, BBUFFER srawi. L, K, 3 ble CGEMM_L2x1_SUB0 cmpwi cr0, L, 1 @@ -919,6 +1001,39 @@ L999_H2: CGEMM_L1_BEGIN: + mr BO, B + mr BBO, BBUFFER + slwi T1, K, 1 + +CGEMM_L1_COPYB: + dcbtst BBO, PRE + + lxvw4x vs3, o0, BO + lxvw4x vs11, o16, BO + xxspltw vs4, vs3, 0 + xxspltw vs5, vs3, 1 + xxspltw vs6, vs3, 2 + xxspltw vs7, vs3, 3 + xxspltw vs12, vs11, 0 + xxspltw vs13, vs11, 1 + xxspltw vs14, vs11, 2 + xxspltw vs15, vs11, 3 + stxvw4x vs4, o0, BBO + stxvw4x vs5, o16, BBO + stxvw4x vs6, o32, BBO + stxvw4x vs7, o48, BBO + addi BO, BO, 32 + addi BBO, BBO, 64 + stxvw4x vs12, o0, BBO + stxvw4x vs13, o16, BBO + stxvw4x vs14, o32, BBO + stxvw4x vs15, o48, BBO + addic. 
T1, T1, -8 + addi BBO, BBO, 64 + + bge CGEMM_L1_COPYB + + andi. T1, N, 1 ble CGEMM_L1_END mr CO, C @@ -929,7 +1044,7 @@ CGEMM_L1_BEGIN: CGEMM_L1x8_BEGIN: - mr BO, B + mr BO, BBUFFER srawi. L, K, 3 ble CGEMM_L1x8_SUB0 cmpwi cr0, L, 1 @@ -1046,7 +1161,7 @@ CGEMM_L1x4_BEGIN: andi. T1, M, 4 ble CGEMM_L1x4_END - mr BO, B + mr BO, BBUFFER srawi. L, K, 3 ble CGEMM_L1x4_SUB0 cmpwi cr0, L, 1 @@ -1146,7 +1261,7 @@ CGEMM_L1x2_BEGIN: andi. T1, M, 2 ble CGEMM_L1x2_END - mr BO, B + mr BO, BBUFFER srawi. L, K, 3 ble CGEMM_L1x2_SUB0 cmpwi cr0, L, 1 @@ -1246,7 +1361,7 @@ CGEMM_L1x1_BEGIN: andi. T1, M, 1 ble CGEMM_L1x1_END - mr BO, B + mr BO, BBUFFER srawi. L, K, 3 ble CGEMM_L1x1_SUB0 cmpwi cr0, L, 1 diff --git a/kernel/power/cgemm_macros_8x4_power8.S b/kernel/power/cgemm_macros_8x4_power8.S index 48a21252c..9a18cb189 100644 --- a/kernel/power/cgemm_macros_8x4_power8.S +++ b/kernel/power/cgemm_macros_8x4_power8.S @@ -86,66 +86,54 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro LOAD4x8_1 lxvw4x vs0, o0, AO // load a0, a1 - lxvw4x vs1, o16, AO // load a2, a3 - lxvw4x vs2, o32, AO // load a4, a5 - lxvw4x vs3, o48, AO // load a6, a7 - addi AO, AO, 64 - lxvw4x vs24, o0, BO // load b0, b1 + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + lxvw4x vs10, o32, BO // load b1_r + lxvw4x vs11, o48, BO // load b1_i - xxspltw vs8, vs24, 0 - xxspltw vs9, vs24, 1 - xxspltw vs10, vs24, 2 - xxspltw vs11, vs24, 3 + addi BO, BO, 64 - lxvw4x vs25, o16, BO // load b2, b3 + lxvw4x vs12, o0, BO // load b2_r + lxvw4x vs13, o16, BO // load b2_i + lxvw4x vs14, o32, BO // load b3_r + lxvw4x vs15, o48, BO // load b3_i - xxspltw vs12, vs25, 0 - xxspltw vs13, vs25, 1 - xxspltw vs14, vs25, 2 - xxspltw vs15, vs25, 3 + addi BO, BO, 64 - addi BO, BO, 32 - .endm .macro KERNEL4x8_I1 lxvw4x vs4, o0, AO // load a0, a1 - lxvw4x vs5, o16, AO // load a2, a3 - lxvw4x vs6, o32, AO // load a4, a5 - lxvw4x vs7, o48, AO // load a6, a7 - addi AO, AO, 64 - lxvw4x vs24, o0, BO // load b0, b1 + lxvw4x vs16, o0, BO // load b0_r + lxvw4x vs17, o16, BO // load b0_i + lxvw4x vs18, o32, BO // load b1_r + lxvw4x vs19, o48, BO // load b1_i - xxspltw vs16, vs24, 0 - xxspltw vs17, vs24, 1 - xxspltw vs18, vs24, 2 - xxspltw vs19, vs24, 3 + addi BO, BO, 64 - lxvw4x vs25, o16, BO // load b2, b3 + lxvw4x vs20, o0, BO // load b2_r + lxvw4x vs21, o16, BO // load b2_i + lxvw4x vs22, o32, BO // load b3_r + lxvw4x vs23, o48, BO // load b3_i - xxspltw vs20, vs25, 0 - xxspltw vs21, vs25, 1 - xxspltw vs22, vs25, 2 - xxspltw vs23, vs25, 3 + addi BO, BO, 64 - addi BO, BO, 32 - xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i @@ -190,33 +178,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
lxvw4x vs4, o0, AO // load a0, a1 - lxvw4x vs5, o16, AO // load a2, a3 - lxvw4x vs6, o32, AO // load a4, a5 - lxvw4x vs7, o48, AO // load a6, a7 - addi AO, AO, 64 - lxvw4x vs24, o0, BO // load b0, b1 + lxvw4x vs16, o0, BO // load b0_r + lxvw4x vs17, o16, BO // load b0_i + lxvw4x vs18, o32, BO // load b1_r + lxvw4x vs19, o48, BO // load b1_i - xxspltw vs16, vs24, 0 - xxspltw vs17, vs24, 1 - xxspltw vs18, vs24, 2 - xxspltw vs19, vs24, 3 + addi BO, BO, 64 - lxvw4x vs25, o16, BO // load b2, b3 + lxvw4x vs20, o0, BO // load b2_r + lxvw4x vs21, o16, BO // load b2_i + lxvw4x vs22, o32, BO // load b3_r + lxvw4x vs23, o48, BO // load b3_i - xxspltw vs20, vs25, 0 - xxspltw vs21, vs25, 1 - xxspltw vs22, vs25, 2 - xxspltw vs23, vs25, 3 + addi BO, BO, 64 - addi BO, BO, 32 - xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i @@ -261,33 +243,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
lxvw4x vs0, o0, AO // load a0, a1 - lxvw4x vs1, o16, AO // load a2, a3 - lxvw4x vs2, o32, AO // load a4, a5 - lxvw4x vs3, o48, AO // load a6, a7 - addi AO, AO, 64 - lxvw4x vs24, o0, BO // load b0, b1 + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + lxvw4x vs10, o32, BO // load b1_r + lxvw4x vs11, o48, BO // load b1_i - xxspltw vs8, vs24, 0 - xxspltw vs9, vs24, 1 - xxspltw vs10, vs24, 2 - xxspltw vs11, vs24, 3 + addi BO, BO, 64 - lxvw4x vs25, o16, BO // load b2, b3 + lxvw4x vs12, o0, BO // load b2_r + lxvw4x vs13, o16, BO // load b2_i + lxvw4x vs14, o32, BO // load b3_r + lxvw4x vs15, o48, BO // load b3_i - xxspltw vs12, vs25, 0 - xxspltw vs13, vs25, 1 - xxspltw vs14, vs25, 2 - xxspltw vs15, vs25, 3 + addi BO, BO, 64 - addi BO, BO, 32 - xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i @@ -374,33 +350,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
lxvw4x vs0, o0, AO // load a0, a1 - lxvw4x vs1, o16, AO // load a2, a3 - lxvw4x vs2, o32, AO // load a4, a5 - lxvw4x vs3, o48, AO // load a6, a7 - addi AO, AO, 64 - lxvw4x vs24, o0, BO // load b0, b1 + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + lxvw4x vs10, o32, BO // load b1_r + lxvw4x vs11, o48, BO // load b1_i - xxspltw vs8, vs24, 0 - xxspltw vs9, vs24, 1 - xxspltw vs10, vs24, 2 - xxspltw vs11, vs24, 3 + addi BO, BO, 64 - lxvw4x vs25, o16, BO // load b2, b3 + lxvw4x vs12, o0, BO // load b2_r + lxvw4x vs13, o16, BO // load b2_i + lxvw4x vs14, o32, BO // load b3_r + lxvw4x vs15, o48, BO // load b3_i - xxspltw vs12, vs25, 0 - xxspltw vs13, vs25, 1 - xxspltw vs14, vs25, 2 - xxspltw vs15, vs25, 3 + addi BO, BO, 64 - addi BO, BO, 32 - xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i @@ -445,33 +415,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvw4x vs0, o0, AO // load a0, a1 - lxvw4x vs1, o16, AO // load a2, a3 - lxvw4x vs2, o32, AO // load a4, a5 - lxvw4x vs3, o48, AO // load a6, a7 - addi AO, AO, 64 - lxvw4x vs24, o0, BO // load b0, b1 + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + lxvw4x vs10, o32, BO // load b1_r + lxvw4x vs11, o48, BO // load b1_i - xxspltw vs8, vs24, 0 - xxspltw vs9, vs24, 1 - xxspltw vs10, vs24, 2 - xxspltw vs11, vs24, 3 + addi BO, BO, 64 - lxvw4x vs25, o16, BO // load b2, b3 + lxvw4x vs12, o0, BO // load b2_r + lxvw4x vs13, o16, BO // load b2_i + lxvw4x vs14, o32, BO // load b3_r + lxvw4x vs15, o48, BO // load b3_i - xxspltw vs12, vs25, 0 - xxspltw vs13, vs25, 1 - xxspltw vs14, vs25, 2 - xxspltw vs15, vs25, 3 + addi BO, BO, 64 - addi BO, BO, 32 - xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i @@ -515,6 +479,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH 
DAMAGE. .macro SAVE4x8 mr T1, CO + xxlxor vs24, vs24, vs24 // N=0 @@ -571,7 +536,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r @@ -637,7 +601,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r @@ -703,7 +666,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r @@ -769,7 +731,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r @@ -841,7 +802,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r @@ -907,7 +867,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r @@ -973,7 +932,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r @@ -1039,7 +997,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r @@ -1111,7 +1068,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r @@ -1177,7 +1133,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r @@ -1243,7 +1198,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r @@ -1309,7 +1263,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r @@ -1381,7 +1334,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r @@ -1447,7 +1399,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r @@ -1513,7 +1464,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r @@ -1579,7 +1529,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r @@ -1607,57 +1556,49 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro LOAD4x4_1 lxvw4x vs0, o0, AO // load a0, a1 - lxvw4x vs1, o16, AO // load a2, a3 - addi AO, AO, 32 - lxvw4x vs24, o0, BO // load b0, b1 + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + lxvw4x vs10, o32, BO // load b1_r + lxvw4x vs11, o48, BO // load b1_i - xxspltw vs8, vs24, 0 - xxspltw vs9, vs24, 1 - xxspltw vs10, vs24, 2 - xxspltw vs11, vs24, 3 + addi BO, BO, 64 - lxvw4x vs25, o16, BO // load b2, b3 + lxvw4x vs12, o0, BO // load b2_r + lxvw4x vs13, o16, BO // load b2_i + lxvw4x vs14, o32, BO // load b3_r + lxvw4x vs15, o48, BO // load b3_i - xxspltw vs12, vs25, 0 - xxspltw vs13, vs25, 1 - xxspltw vs14, vs25, 2 - xxspltw vs15, vs25, 3 + addi BO, BO, 64 - addi BO, BO, 32 - .endm .macro KERNEL4x4_I1 lxvw4x vs4, o0, AO // load a0, a1 - lxvw4x vs5, o16, AO // load a2, a3 - addi AO, AO, 32 - lxvw4x vs24, o0, BO // load b0, b1 - - xxspltw vs16, vs24, 0 - xxspltw vs17, vs24, 1 - xxspltw vs18, vs24, 2 - xxspltw vs19, vs24, 3 + lxvw4x vs16, o0, BO // load b0_r + lxvw4x vs17, o16, BO // load b0_i + lxvw4x vs18, o32, BO // load b1_r + lxvw4x vs19, o48, BO // load b1_i - lxvw4x vs25, o16, BO // load b2, b3 + addi BO, BO, 64 - xxspltw vs20, vs25, 0 - xxspltw vs21, vs25, 1 - xxspltw vs22, vs25, 2 - xxspltw vs23, vs25, 3 + lxvw4x vs20, o0, BO // load b2_r + lxvw4x vs21, o16, BO // load b2_i + lxvw4x vs22, o32, BO // load b3_r + lxvw4x vs23, o48, BO // load b3_i + addi BO, BO, 64 - addi BO, BO, 32 xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r @@ -1687,28 +1628,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
lxvw4x vs4, o0, AO // load a0, a1 - lxvw4x vs5, o16, AO // load a2, a3 - addi AO, AO, 32 - lxvw4x vs24, o0, BO // load b0, b1 - - xxspltw vs16, vs24, 0 - xxspltw vs17, vs24, 1 - xxspltw vs18, vs24, 2 - xxspltw vs19, vs24, 3 + lxvw4x vs16, o0, BO // load b0_r + lxvw4x vs17, o16, BO // load b0_i + lxvw4x vs18, o32, BO // load b1_r + lxvw4x vs19, o48, BO // load b1_i - lxvw4x vs25, o16, BO // load b2, b3 + addi BO, BO, 64 - xxspltw vs20, vs25, 0 - xxspltw vs21, vs25, 1 - xxspltw vs22, vs25, 2 - xxspltw vs23, vs25, 3 + lxvw4x vs20, o0, BO // load b2_r + lxvw4x vs21, o16, BO // load b2_i + lxvw4x vs22, o32, BO // load b3_r + lxvw4x vs23, o48, BO // load b3_i + addi BO, BO, 64 - addi BO, BO, 32 xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r @@ -1738,29 +1675,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvw4x vs0, o0, AO // load a0, a1 - lxvw4x vs1, o16, AO // load a2, a3 - addi AO, AO, 32 - lxvw4x vs24, o0, BO // load b0, b1 + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + lxvw4x vs10, o32, BO // load b1_r + lxvw4x vs11, o48, BO // load b1_i - xxspltw vs8, vs24, 0 - xxspltw vs9, vs24, 1 - xxspltw vs10, vs24, 2 - xxspltw vs11, vs24, 3 + addi BO, BO, 64 - lxvw4x vs25, o16, BO // load b2, b3 + lxvw4x vs12, o0, BO // load b2_r + lxvw4x vs13, o16, BO // load b2_i + lxvw4x vs14, o32, BO // load b3_r + lxvw4x vs15, o48, BO // load b3_i - xxspltw vs12, vs25, 0 - xxspltw vs13, vs25, 1 - xxspltw vs14, vs25, 2 - xxspltw vs15, vs25, 3 + addi BO, BO, 64 - addi BO, BO, 32 - xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i @@ -1815,28 +1748,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
lxvw4x vs0, o0, AO // load a0, a1 - lxvw4x vs1, o16, AO // load a2, a3 - addi AO, AO, 32 - lxvw4x vs24, o0, BO // load b0, b1 - - xxspltw vs8, vs24, 0 - xxspltw vs9, vs24, 1 - xxspltw vs10, vs24, 2 - xxspltw vs11, vs24, 3 + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + lxvw4x vs10, o32, BO // load b1_r + lxvw4x vs11, o48, BO // load b1_i - lxvw4x vs25, o16, BO // load b2, b3 + addi BO, BO, 64 - xxspltw vs12, vs25, 0 - xxspltw vs13, vs25, 1 - xxspltw vs14, vs25, 2 - xxspltw vs15, vs25, 3 + lxvw4x vs12, o0, BO // load b2_r + lxvw4x vs13, o16, BO // load b2_i + lxvw4x vs14, o32, BO // load b3_r + lxvw4x vs15, o48, BO // load b3_i + addi BO, BO, 64 - addi BO, BO, 32 xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r @@ -1866,28 +1795,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvw4x vs0, o0, AO // load a0, a1 - lxvw4x vs1, o16, AO // load a2, a3 - addi AO, AO, 32 - lxvw4x vs24, o0, BO // load b0, b1 - - xxspltw vs8, vs24, 0 - xxspltw vs9, vs24, 1 - xxspltw vs10, vs24, 2 - xxspltw vs11, vs24, 3 + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + lxvw4x vs10, o32, BO // load b1_r + lxvw4x vs11, o48, BO // load b1_i - lxvw4x vs25, o16, BO // load b2, b3 + addi BO, BO, 64 - xxspltw vs12, vs25, 0 - xxspltw vs13, vs25, 1 - xxspltw vs14, vs25, 2 - xxspltw vs15, vs25, 3 + lxvw4x vs12, o0, BO // load b2_r + lxvw4x vs13, o16, BO // load b2_i + lxvw4x vs14, o32, BO // load b3_r + lxvw4x vs15, o48, BO // load b3_i + addi BO, BO, 64 - addi BO, BO, 32 xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r @@ -1916,6 +1841,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE4x4 mr T1, CO + xxlxor vs24, vs24, vs24 // N=0 @@ -1972,7 +1898,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r @@ -2038,7 +1963,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r @@ -2110,7 +2034,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r @@ -2176,7 +2099,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r @@ -2248,7 +2170,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r @@ -2314,7 +2235,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r @@ -2386,7 +2306,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r @@ -2452,7 +2371,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r @@ -2481,25 +2399,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvw4x vs0, o0, AO // load a0, a1 - addi AO, AO, 16 - lxvw4x vs24, o0, BO // load b0, b1 - - xxspltw vs8, vs24, 0 - xxspltw vs9, vs24, 1 - xxspltw vs10, vs24, 2 - xxspltw vs11, vs24, 3 + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + lxvw4x vs10, o32, BO // load b1_r + lxvw4x vs11, o48, BO // load b1_i - lxvw4x vs25, o16, BO // load b2, b3 + addi BO, BO, 64 - xxspltw vs12, vs25, 0 - xxspltw vs13, vs25, 1 - xxspltw vs14, vs25, 2 - xxspltw vs15, vs25, 3 + lxvw4x vs12, o0, BO // load b2_r + lxvw4x vs13, o16, BO // load b2_i + lxvw4x vs14, o32, BO // load b3_r + lxvw4x vs15, o48, BO // load b3_i + addi BO, BO, 64 - addi BO, BO, 32 .endm @@ -2508,25 +2423,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
lxvw4x vs4, o0, AO // load a0, a1 - addi AO, AO, 16 - lxvw4x vs24, o0, BO // load b0, b1 - - xxspltw vs16, vs24, 0 - xxspltw vs17, vs24, 1 - xxspltw vs18, vs24, 2 - xxspltw vs19, vs24, 3 + lxvw4x vs16, o0, BO // load b0_r + lxvw4x vs17, o16, BO // load b0_i + lxvw4x vs18, o32, BO // load b1_r + lxvw4x vs19, o48, BO // load b1_i - lxvw4x vs25, o16, BO // load b2, b3 + addi BO, BO, 64 - xxspltw vs20, vs25, 0 - xxspltw vs21, vs25, 1 - xxspltw vs22, vs25, 2 - xxspltw vs23, vs25, 3 + lxvw4x vs20, o0, BO // load b2_r + lxvw4x vs21, o16, BO // load b2_i + lxvw4x vs22, o32, BO // load b3_r + lxvw4x vs23, o48, BO // load b3_i + addi BO, BO, 64 - addi BO, BO, 32 xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r @@ -2549,25 +2461,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvw4x vs4, o0, AO // load a0, a1 - addi AO, AO, 16 - lxvw4x vs24, o0, BO // load b0, b1 - - xxspltw vs16, vs24, 0 - xxspltw vs17, vs24, 1 - xxspltw vs18, vs24, 2 - xxspltw vs19, vs24, 3 + lxvw4x vs16, o0, BO // load b0_r + lxvw4x vs17, o16, BO // load b0_i + lxvw4x vs18, o32, BO // load b1_r + lxvw4x vs19, o48, BO // load b1_i - lxvw4x vs25, o16, BO // load b2, b3 + addi BO, BO, 64 - xxspltw vs20, vs25, 0 - xxspltw vs21, vs25, 1 - xxspltw vs22, vs25, 2 - xxspltw vs23, vs25, 3 + lxvw4x vs20, o0, BO // load b2_r + lxvw4x vs21, o16, BO // load b2_i + lxvw4x vs22, o32, BO // load b3_r + lxvw4x vs23, o48, BO // load b3_i + addi BO, BO, 64 - addi BO, BO, 32 xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r @@ -2590,26 +2499,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
lxvw4x vs0, o0, AO // load a0, a1 - addi AO, AO, 16 - lxvw4x vs24, o0, BO // load b0, b1 + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + lxvw4x vs10, o32, BO // load b1_r + lxvw4x vs11, o48, BO // load b1_i - xxspltw vs8, vs24, 0 - xxspltw vs9, vs24, 1 - xxspltw vs10, vs24, 2 - xxspltw vs11, vs24, 3 + addi BO, BO, 64 - lxvw4x vs25, o16, BO // load b2, b3 + lxvw4x vs12, o0, BO // load b2_r + lxvw4x vs13, o16, BO // load b2_i + lxvw4x vs14, o32, BO // load b3_r + lxvw4x vs15, o48, BO // load b3_i - xxspltw vs12, vs25, 0 - xxspltw vs13, vs25, 1 - xxspltw vs14, vs25, 2 - xxspltw vs15, vs25, 3 + addi BO, BO, 64 - addi BO, BO, 32 - xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i @@ -2649,25 +2555,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvw4x vs0, o0, AO // load a0, a1 - addi AO, AO, 16 - lxvw4x vs24, o0, BO // load b0, b1 - - xxspltw vs8, vs24, 0 - xxspltw vs9, vs24, 1 - xxspltw vs10, vs24, 2 - xxspltw vs11, vs24, 3 + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + lxvw4x vs10, o32, BO // load b1_r + lxvw4x vs11, o48, BO // load b1_i - lxvw4x vs25, o16, BO // load b2, b3 + addi BO, BO, 64 - xxspltw vs12, vs25, 0 - xxspltw vs13, vs25, 1 - xxspltw vs14, vs25, 2 - xxspltw vs15, vs25, 3 + lxvw4x vs12, o0, BO // load b2_r + lxvw4x vs13, o16, BO // load b2_i + lxvw4x vs14, o32, BO // load b3_r + lxvw4x vs15, o48, BO // load b3_i + addi BO, BO, 64 - addi BO, BO, 32 xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r @@ -2690,25 +2593,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
lxvw4x vs0, o0, AO // load a0, a1 - addi AO, AO, 16 - lxvw4x vs24, o0, BO // load b0, b1 - - xxspltw vs8, vs24, 0 - xxspltw vs9, vs24, 1 - xxspltw vs10, vs24, 2 - xxspltw vs11, vs24, 3 + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + lxvw4x vs10, o32, BO // load b1_r + lxvw4x vs11, o48, BO // load b1_i - lxvw4x vs25, o16, BO // load b2, b3 + addi BO, BO, 64 - xxspltw vs12, vs25, 0 - xxspltw vs13, vs25, 1 - xxspltw vs14, vs25, 2 - xxspltw vs15, vs25, 3 + lxvw4x vs12, o0, BO // load b2_r + lxvw4x vs13, o16, BO // load b2_i + lxvw4x vs14, o32, BO // load b3_r + lxvw4x vs15, o48, BO // load b3_i + addi BO, BO, 64 - addi BO, BO, 32 xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r @@ -2729,6 +2629,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE4x2 mr T1, CO + xxlxor vs24, vs24, vs24 // N=0 @@ -2785,7 +2686,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r @@ -2857,7 +2757,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r @@ -2929,7 +2828,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r @@ -3001,7 +2899,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r @@ -3033,27 +2930,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 8 - mr T1, BO + lxsspx vs8, o0, BO // load b0_r + lxsspx vs9, o16, BO // load b0_i + lxsspx vs10, o32, BO // load b1_r + lxsspx vs11, o48, BO // load b1_i - lxsspx vs8, o0, T1 // load b0_r - lxsspx vs9, o4, T1 // load b0_i + addi BO, BO, 64 - addi T1, T1,8 + lxsspx vs12, o0, BO // load b2_r + lxsspx vs13, o16, BO // load b2_i + lxsspx vs14, o32, BO // load b3_r + lxsspx vs15, o48, BO // load b3_i - lxsspx vs10, o0, T1 // load b1_r - lxsspx vs11, o4, T1 // load b1_i + addi BO, BO, 64 - addi T1, T1,8 - - lxsspx vs12, o0, T1 // load b2_r - lxsspx vs13, o4, T1 // load b2_i - - addi T1, T1,8 - - lxsspx vs14, o0, T1 // load b3_r - lxsspx vs15, o4, T1 // load b3_i - - addi BO, BO, 32 .endm @@ -3065,27 +2955,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi AO, AO, 8 - mr T1, BO - - lxsspx vs16, o0, T1 // load b0_r - lxsspx vs17, o4, T1 // load b0_i + lxsspx vs16, o0, BO // load b0_r + lxsspx vs17, o16, BO // load b0_i + lxsspx vs18, o32, BO // load b1_r + lxsspx vs19, o48, BO // load b1_i - addi T1, T1,8 + addi BO, BO, 64 - lxsspx vs18, o0, T1 // load b1_r - lxsspx vs19, o4, T1 // load b1_i + lxsspx vs20, o0, BO // load b2_r + lxsspx vs21, o16, BO // load b2_i + lxsspx vs22, o32, BO // load b3_r + lxsspx vs23, o48, BO // load b3_i - addi T1, T1,8 + addi BO, BO, 64 - lxsspx vs20, o0, T1 // load b2_r - lxsspx vs21, o4, T1 // load b2_i - - addi T1, T1,8 - - lxsspx vs22, o0, T1 // load b3_r - lxsspx vs23, o4, T1 // load b3_i - - addi BO, BO, 32 xsmuldp vs32, vs0, vs8 // a0_r*b0_r @@ -3119,27 +3002,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 8 - mr T1, BO - - lxsspx vs16, o0, T1 // load b0_r - lxsspx vs17, o4, T1 // load b0_i - - addi T1, T1,8 + lxsspx vs16, o0, BO // load b0_r + lxsspx vs17, o16, BO // load b0_i + lxsspx vs18, o32, BO // load b1_r + lxsspx vs19, o48, BO // load b1_i - lxsspx vs18, o0, T1 // load b1_r - lxsspx vs19, o4, T1 // load b1_i + addi BO, BO, 64 - addi T1, T1,8 + lxsspx vs20, o0, BO // load b2_r + lxsspx vs21, o16, BO // load b2_i + lxsspx vs22, o32, BO // load b3_r + lxsspx vs23, o48, BO // load b3_i - lxsspx vs20, o0, T1 // load b2_r - lxsspx vs21, o4, T1 // load b2_i + addi BO, BO, 64 - addi T1, T1,8 - - lxsspx vs22, o0, T1 // load b3_r - lxsspx vs23, o4, T1 // load b3_i - - addi BO, BO, 32 xsmaddadp vs32, vs0, vs8 // a0_r*b0_r @@ -3173,27 +3049,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi AO, AO, 8 - mr T1, BO - - lxsspx vs8, o0, T1 // load b0_r - lxsspx vs9, o4, T1 // load b0_i - - addi T1, T1,8 - - lxsspx vs10, o0, T1 // load b1_r - lxsspx vs11, o4, T1 // load b1_i + lxsspx vs8, o0, BO // load b0_r + lxsspx vs9, o16, BO // load b0_i + lxsspx vs10, o32, BO // load b1_r + lxsspx vs11, o48, BO // load b1_i - addi T1, T1,8 + addi BO, BO, 64 - lxsspx vs12, o0, T1 // load b2_r - lxsspx vs13, o4, T1 // load b2_i + lxsspx vs12, o0, BO // load b2_r + lxsspx vs13, o16, BO // load b2_i + lxsspx vs14, o32, BO // load b3_r + lxsspx vs15, o48, BO // load b3_i - addi T1, T1,8 + addi BO, BO, 64 - lxsspx vs14, o0, T1 // load b3_r - lxsspx vs15, o4, T1 // load b3_i - - addi BO, BO, 32 xsmaddadp vs32, vs4, vs16 // a4_r*b0_r @@ -3253,27 +3122,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 8 - mr T1, BO - - lxsspx vs8, o0, T1 // load b0_r - lxsspx vs9, o4, T1 // load b0_i - - addi T1, T1,8 - - lxsspx vs10, o0, T1 // load b1_r - lxsspx vs11, o4, T1 // load b1_i - - addi T1, T1,8 + lxsspx vs8, o0, BO // load b0_r + lxsspx vs9, o16, BO // load b0_i + lxsspx vs10, o32, BO // load b1_r + lxsspx vs11, o48, BO // load b1_i - lxsspx vs12, o0, T1 // load b2_r - lxsspx vs13, o4, T1 // load b2_i + addi BO, BO, 64 - addi T1, T1,8 + lxsspx vs12, o0, BO // load b2_r + lxsspx vs13, o16, BO // load b2_i + lxsspx vs14, o32, BO // load b3_r + lxsspx vs15, o48, BO // load b3_i - lxsspx vs14, o0, T1 // load b3_r - lxsspx vs15, o4, T1 // load b3_i + addi BO, BO, 64 - addi BO, BO, 32 xsmuldp vs32, vs0, vs8 // a0_r*b0_r @@ -3307,27 +3169,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi AO, AO, 8 - mr T1, BO - - lxsspx vs8, o0, T1 // load b0_r - lxsspx vs9, o4, T1 // load b0_i - - addi T1, T1,8 + lxsspx vs8, o0, BO // load b0_r + lxsspx vs9, o16, BO // load b0_i + lxsspx vs10, o32, BO // load b1_r + lxsspx vs11, o48, BO // load b1_i - lxsspx vs10, o0, T1 // load b1_r - lxsspx vs11, o4, T1 // load b1_i + addi BO, BO, 64 - addi T1, T1,8 + lxsspx vs12, o0, BO // load b2_r + lxsspx vs13, o16, BO // load b2_i + lxsspx vs14, o32, BO // load b3_r + lxsspx vs15, o48, BO // load b3_i - lxsspx vs12, o0, T1 // load b2_r - lxsspx vs13, o4, T1 // load b2_i + addi BO, BO, 64 - addi T1, T1,8 - - lxsspx vs14, o0, T1 // load b3_r - lxsspx vs15, o4, T1 // load b3_i - - addi BO, BO, 32 xsmaddadp vs32, vs0, vs8 // a0_r*b0_r @@ -3356,6 +3211,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE4x1 mr T1, CO + xxlxor vs24, vs24, vs24 // N=0 @@ -3536,25 +3392,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro LOAD2x8_1 lxvw4x vs0, o0, AO // load a0, a1 - lxvw4x vs1, o16, AO // load a2, a3 - lxvw4x vs2, o32, AO // load a4, a5 - lxvw4x vs3, o48, AO // load a6, a7 - addi AO, AO, 64 - lxvw4x vs24, o0, BO // load b0, b1 - - xxspltw vs8, vs24, 0 - xxspltw vs9, vs24, 1 - xxspltw vs10, vs24, 2 - xxspltw vs11, vs24, 3 + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + lxvw4x vs10, o32, BO // load b1_r + lxvw4x vs11, o48, BO // load b1_i + addi BO, BO, 64 - addi BO, BO, 16 .endm @@ -3562,25 +3412,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
lxvw4x vs4, o0, AO // load a0, a1 - lxvw4x vs5, o16, AO // load a2, a3 - lxvw4x vs6, o32, AO // load a4, a5 - lxvw4x vs7, o48, AO // load a6, a7 - addi AO, AO, 64 - lxvw4x vs24, o0, BO // load b0, b1 - - xxspltw vs16, vs24, 0 - xxspltw vs17, vs24, 1 - xxspltw vs18, vs24, 2 - xxspltw vs19, vs24, 3 + lxvw4x vs16, o0, BO // load b0_r + lxvw4x vs17, o16, BO // load b0_i + lxvw4x vs18, o32, BO // load b1_r + lxvw4x vs19, o48, BO // load b1_i + addi BO, BO, 64 - addi BO, BO, 16 xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r @@ -3608,26 +3452,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvw4x vs4, o0, AO // load a0, a1 - lxvw4x vs5, o16, AO // load a2, a3 - lxvw4x vs6, o32, AO // load a4, a5 - lxvw4x vs7, o48, AO // load a6, a7 - addi AO, AO, 64 - lxvw4x vs24, o0, BO // load b0, b1 + lxvw4x vs16, o0, BO // load b0_r + lxvw4x vs17, o16, BO // load b0_i + lxvw4x vs18, o32, BO // load b1_r + lxvw4x vs19, o48, BO // load b1_i - xxspltw vs16, vs24, 0 - xxspltw vs17, vs24, 1 - xxspltw vs18, vs24, 2 - xxspltw vs19, vs24, 3 + addi BO, BO, 64 - addi BO, BO, 16 - xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i @@ -3654,25 +3492,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
lxvw4x vs0, o0, AO // load a0, a1 - lxvw4x vs1, o16, AO // load a2, a3 - lxvw4x vs2, o32, AO // load a4, a5 - lxvw4x vs3, o48, AO // load a6, a7 - addi AO, AO, 64 - lxvw4x vs24, o0, BO // load b0, b1 - - xxspltw vs8, vs24, 0 - xxspltw vs9, vs24, 1 - xxspltw vs10, vs24, 2 - xxspltw vs11, vs24, 3 + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + lxvw4x vs10, o32, BO // load b1_r + lxvw4x vs11, o48, BO // load b1_i + addi BO, BO, 64 - addi BO, BO, 16 xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r @@ -3724,25 +3556,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvw4x vs0, o0, AO // load a0, a1 - lxvw4x vs1, o16, AO // load a2, a3 - lxvw4x vs2, o32, AO // load a4, a5 - lxvw4x vs3, o48, AO // load a6, a7 - addi AO, AO, 64 - lxvw4x vs24, o0, BO // load b0, b1 - - xxspltw vs8, vs24, 0 - xxspltw vs9, vs24, 1 - xxspltw vs10, vs24, 2 - xxspltw vs11, vs24, 3 + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + lxvw4x vs10, o32, BO // load b1_r + lxvw4x vs11, o48, BO // load b1_i + addi BO, BO, 64 - addi BO, BO, 16 xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r @@ -3770,26 +3596,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvw4x vs0, o0, AO // load a0, a1 - lxvw4x vs1, o16, AO // load a2, a3 - lxvw4x vs2, o32, AO // load a4, a5 - lxvw4x vs3, o48, AO // load a6, a7 - addi AO, AO, 64 - lxvw4x vs24, o0, BO // load b0, b1 + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + lxvw4x vs10, o32, BO // load b1_r + lxvw4x vs11, o48, BO // load b1_i - xxspltw vs8, vs24, 0 - xxspltw vs9, vs24, 1 - xxspltw vs10, vs24, 2 - xxspltw vs11, vs24, 3 + addi BO, BO, 64 - addi BO, BO, 16 - xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i @@ -3815,6 +3635,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro SAVE2x8 mr T1, CO + xxlxor vs24, vs24, vs24 // N=0 @@ -3871,7 +3692,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r @@ -3937,7 +3757,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r @@ -4003,7 +3822,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r @@ -4069,7 +3887,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r @@ -4141,7 +3958,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r @@ -4207,7 +4023,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r @@ -4273,7 +4088,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r @@ -4339,7 +4153,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r @@ -4367,44 +4180,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro LOAD2x4_1 lxvw4x vs0, o0, AO // load a0, a1 - lxvw4x vs1, o16, AO // load a2, a3 - addi AO, AO, 32 - lxvw4x vs24, o0, BO // load b0, b1 + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + lxvw4x vs10, o32, BO // load b1_r + lxvw4x vs11, o48, BO // load b1_i - xxspltw vs8, vs24, 0 - xxspltw vs9, vs24, 1 - xxspltw vs10, vs24, 2 - xxspltw vs11, vs24, 3 + addi BO, BO, 64 - addi BO, BO, 16 - .endm .macro KERNEL2x4_I1 lxvw4x vs4, o0, AO // load a0, a1 - lxvw4x vs5, o16, AO // load a2, a3 - addi AO, AO, 32 - lxvw4x vs24, o0, BO // load b0, b1 + lxvw4x vs16, o0, BO // load b0_r + lxvw4x vs17, o16, BO // load b0_i + lxvw4x vs18, o32, BO // load b1_r + lxvw4x vs19, o48, BO // load b1_i - xxspltw vs16, vs24, 0 - xxspltw vs17, vs24, 1 - xxspltw vs18, vs24, 2 - xxspltw vs19, vs24, 3 + addi BO, BO, 64 - addi BO, BO, 16 - xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i @@ -4423,22 +4228,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvw4x vs4, o0, AO // load a0, a1 - lxvw4x vs5, o16, AO // load a2, a3 - addi AO, AO, 32 - lxvw4x vs24, o0, BO // load b0, b1 + lxvw4x vs16, o0, BO // load b0_r + lxvw4x vs17, o16, BO // load b0_i + lxvw4x vs18, o32, BO // load b1_r + lxvw4x vs19, o48, BO // load b1_i - xxspltw vs16, vs24, 0 - xxspltw vs17, vs24, 1 - xxspltw vs18, vs24, 2 - xxspltw vs19, vs24, 3 + addi BO, BO, 64 - addi BO, BO, 16 - xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i @@ -4457,22 +4258,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
lxvw4x vs0, o0, AO // load a0, a1 - lxvw4x vs1, o16, AO // load a2, a3 - addi AO, AO, 32 - lxvw4x vs24, o0, BO // load b0, b1 + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + lxvw4x vs10, o32, BO // load b1_r + lxvw4x vs11, o48, BO // load b1_i - xxspltw vs8, vs24, 0 - xxspltw vs9, vs24, 1 - xxspltw vs10, vs24, 2 - xxspltw vs11, vs24, 3 + addi BO, BO, 64 - addi BO, BO, 16 - xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i @@ -4507,22 +4304,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvw4x vs0, o0, AO // load a0, a1 - lxvw4x vs1, o16, AO // load a2, a3 - addi AO, AO, 32 - lxvw4x vs24, o0, BO // load b0, b1 + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + lxvw4x vs10, o32, BO // load b1_r + lxvw4x vs11, o48, BO // load b1_i - xxspltw vs8, vs24, 0 - xxspltw vs9, vs24, 1 - xxspltw vs10, vs24, 2 - xxspltw vs11, vs24, 3 + addi BO, BO, 64 - addi BO, BO, 16 - xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i @@ -4541,22 +4334,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvw4x vs0, o0, AO // load a0, a1 - lxvw4x vs1, o16, AO // load a2, a3 - addi AO, AO, 32 - lxvw4x vs24, o0, BO // load b0, b1 + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + lxvw4x vs10, o32, BO // load b1_r + lxvw4x vs11, o48, BO // load b1_i - xxspltw vs8, vs24, 0 - xxspltw vs9, vs24, 1 - xxspltw vs10, vs24, 2 - xxspltw vs11, vs24, 3 + addi BO, BO, 64 - addi BO, BO, 16 - xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i @@ -4574,6 +4363,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro SAVE2x4 mr T1, CO + xxlxor vs24, vs24, vs24 // N=0 @@ -4630,7 +4420,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r @@ -4696,7 +4485,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r @@ -4768,7 +4556,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r @@ -4834,7 +4621,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r @@ -4863,18 +4649,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
lxvw4x vs0, o0, AO // load a0, a1 - addi AO, AO, 16 - lxvw4x vs24, o0, BO // load b0, b1 - - xxspltw vs8, vs24, 0 - xxspltw vs9, vs24, 1 - xxspltw vs10, vs24, 2 - xxspltw vs11, vs24, 3 + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + lxvw4x vs10, o32, BO // load b1_r + lxvw4x vs11, o48, BO // load b1_i + addi BO, BO, 64 - addi BO, BO, 16 .endm @@ -4883,19 +4666,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvw4x vs4, o0, AO // load a0, a1 - addi AO, AO, 16 - lxvw4x vs24, o0, BO // load b0, b1 + lxvw4x vs16, o0, BO // load b0_r + lxvw4x vs17, o16, BO // load b0_i + lxvw4x vs18, o32, BO // load b1_r + lxvw4x vs19, o48, BO // load b1_i - xxspltw vs16, vs24, 0 - xxspltw vs17, vs24, 1 - xxspltw vs18, vs24, 2 - xxspltw vs19, vs24, 3 + addi BO, BO, 64 - addi BO, BO, 16 - xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i @@ -4911,18 +4691,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvw4x vs4, o0, AO // load a0, a1 - addi AO, AO, 16 - lxvw4x vs24, o0, BO // load b0, b1 - - xxspltw vs16, vs24, 0 - xxspltw vs17, vs24, 1 - xxspltw vs18, vs24, 2 - xxspltw vs19, vs24, 3 + lxvw4x vs16, o0, BO // load b0_r + lxvw4x vs17, o16, BO // load b0_i + lxvw4x vs18, o32, BO // load b1_r + lxvw4x vs19, o48, BO // load b1_i + addi BO, BO, 64 - addi BO, BO, 16 xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r @@ -4939,18 +4716,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
lxvw4x vs0, o0, AO // load a0, a1 - addi AO, AO, 16 - lxvw4x vs24, o0, BO // load b0, b1 - - xxspltw vs8, vs24, 0 - xxspltw vs9, vs24, 1 - xxspltw vs10, vs24, 2 - xxspltw vs11, vs24, 3 + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + lxvw4x vs10, o32, BO // load b1_r + lxvw4x vs11, o48, BO // load b1_i + addi BO, BO, 64 - addi BO, BO, 16 xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r @@ -4979,19 +4753,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvw4x vs0, o0, AO // load a0, a1 - addi AO, AO, 16 - lxvw4x vs24, o0, BO // load b0, b1 + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + lxvw4x vs10, o32, BO // load b1_r + lxvw4x vs11, o48, BO // load b1_i - xxspltw vs8, vs24, 0 - xxspltw vs9, vs24, 1 - xxspltw vs10, vs24, 2 - xxspltw vs11, vs24, 3 + addi BO, BO, 64 - addi BO, BO, 16 - xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i @@ -5007,18 +4778,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvw4x vs0, o0, AO // load a0, a1 - addi AO, AO, 16 - lxvw4x vs24, o0, BO // load b0, b1 - - xxspltw vs8, vs24, 0 - xxspltw vs9, vs24, 1 - xxspltw vs10, vs24, 2 - xxspltw vs11, vs24, 3 + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i + lxvw4x vs10, o32, BO // load b1_r + lxvw4x vs11, o48, BO // load b1_i + addi BO, BO, 64 - addi BO, BO, 16 xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r @@ -5033,6 +4801,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE2x2 mr T1, CO + xxlxor vs24, vs24, vs24 // N=0 @@ -5089,7 +4858,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r @@ -5161,7 +4929,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r @@ -5193,17 +4960,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 8 - mr T1, BO + lxsspx vs8, o0, BO // load b0_r + lxsspx vs9, o16, BO // load b0_i + lxsspx vs10, o32, BO // load b1_r + lxsspx vs11, o48, BO // load b1_i - lxsspx vs8, o0, T1 // load b0_r - lxsspx vs9, o4, T1 // load b0_i + addi BO, BO, 64 - addi T1, T1,8 - - lxsspx vs10, o0, T1 // load b1_r - lxsspx vs11, o4, T1 // load b1_i - - addi BO, BO, 16 .endm @@ -5215,17 +4978,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 8 - mr T1, BO - - lxsspx vs16, o0, T1 // load b0_r - lxsspx vs17, o4, T1 // load b0_i + lxsspx vs16, o0, BO // load b0_r + lxsspx vs17, o16, BO // load b0_i + lxsspx vs18, o32, BO // load b1_r + lxsspx vs19, o48, BO // load b1_i - addi T1, T1,8 + addi BO, BO, 64 - lxsspx vs18, o0, T1 // load b1_r - lxsspx vs19, o4, T1 // load b1_i - - addi BO, BO, 16 xsmuldp vs32, vs0, vs8 // a0_r*b0_r @@ -5249,17 +5008,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi AO, AO, 8 - mr T1, BO - - lxsspx vs16, o0, T1 // load b0_r - lxsspx vs17, o4, T1 // load b0_i - - addi T1, T1,8 + lxsspx vs16, o0, BO // load b0_r + lxsspx vs17, o16, BO // load b0_i + lxsspx vs18, o32, BO // load b1_r + lxsspx vs19, o48, BO // load b1_i - lxsspx vs18, o0, T1 // load b1_r - lxsspx vs19, o4, T1 // load b1_i + addi BO, BO, 64 - addi BO, BO, 16 xsmaddadp vs32, vs0, vs8 // a0_r*b0_r @@ -5283,17 +5038,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 8 - mr T1, BO + lxsspx vs8, o0, BO // load b0_r + lxsspx vs9, o16, BO // load b0_i + lxsspx vs10, o32, BO // load b1_r + lxsspx vs11, o48, BO // load b1_i - lxsspx vs8, o0, T1 // load b0_r - lxsspx vs9, o4, T1 // load b0_i + addi BO, BO, 64 - addi T1, T1,8 - - lxsspx vs10, o0, T1 // load b1_r - lxsspx vs11, o4, T1 // load b1_i - - addi BO, BO, 16 xsmaddadp vs32, vs4, vs16 // a4_r*b0_r @@ -5333,17 +5084,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 8 - mr T1, BO - - lxsspx vs8, o0, T1 // load b0_r - lxsspx vs9, o4, T1 // load b0_i + lxsspx vs8, o0, BO // load b0_r + lxsspx vs9, o16, BO // load b0_i + lxsspx vs10, o32, BO // load b1_r + lxsspx vs11, o48, BO // load b1_i - addi T1, T1,8 + addi BO, BO, 64 - lxsspx vs10, o0, T1 // load b1_r - lxsspx vs11, o4, T1 // load b1_i - - addi BO, BO, 16 xsmuldp vs32, vs0, vs8 // a0_r*b0_r @@ -5367,17 +5114,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 8 - mr T1, BO - - lxsspx vs8, o0, T1 // load b0_r - lxsspx vs9, o4, T1 // load b0_i - - addi T1, T1,8 + lxsspx vs8, o0, BO // load b0_r + lxsspx vs9, o16, BO // load b0_i + lxsspx vs10, o32, BO // load b1_r + lxsspx vs11, o48, BO // load b1_i - lxsspx vs10, o0, T1 // load b1_r - lxsspx vs11, o4, T1 // load b1_i + addi BO, BO, 64 - addi BO, BO, 16 xsmaddadp vs32, vs0, vs8 // a0_r*b0_r @@ -5396,6 +5139,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro SAVE2x1 mr T1, CO + xxlxor vs24, vs24, vs24 // N=0 @@ -5492,27 +5236,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro LOAD1x8_1 lxvw4x vs0, o0, AO // load a0, a1 - lxvw4x vs1, o16, AO // load a2, a3 - lxvw4x vs2, o32, AO // load a4, a5 - lxvw4x vs3, o48, AO // load a6, a7 - addi AO, AO, 64 - lxvw4x vs24, o0, BO // load b0, b1 - + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i - - xxspltw vs8, vs24, 0 - xxspltw vs9, vs24, 1 - xxspltw vs10, vs24, 2 - xxspltw vs11, vs24, 3 - - - addi BO, BO, 8 + addi BO, BO, 32 .endm @@ -5520,27 +5253,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvw4x vs4, o0, AO // load a0, a1 - lxvw4x vs5, o16, AO // load a2, a3 - lxvw4x vs6, o32, AO // load a4, a5 - lxvw4x vs7, o48, AO // load a6, a7 - addi AO, AO, 64 - lxvw4x vs24, o0, BO // load b0, b1 - - - - xxspltw vs16, vs24, 0 - xxspltw vs17, vs24, 1 - xxspltw vs18, vs24, 2 - xxspltw vs19, vs24, 3 - + lxvw4x vs16, o0, BO // load b0_r + lxvw4x vs17, o16, BO // load b0_i - addi BO, BO, 8 + addi BO, BO, 32 xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r @@ -5559,27 +5281,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvw4x vs4, o0, AO // load a0, a1 - lxvw4x vs5, o16, AO // load a2, a3 - lxvw4x vs6, o32, AO // load a4, a5 - lxvw4x vs7, o48, AO // load a6, a7 - addi AO, AO, 64 - lxvw4x vs24, o0, BO // load b0, b1 - + lxvw4x vs16, o0, BO // load b0_r + lxvw4x vs17, o16, BO // load b0_i - - xxspltw vs16, vs24, 0 - xxspltw vs17, vs24, 1 - xxspltw vs18, vs24, 2 - xxspltw vs19, vs24, 3 - - - addi BO, BO, 8 + addi BO, BO, 32 xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r @@ -5598,27 +5309,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
lxvw4x vs0, o0, AO // load a0, a1 - lxvw4x vs1, o16, AO // load a2, a3 - lxvw4x vs2, o32, AO // load a4, a5 - lxvw4x vs3, o48, AO // load a6, a7 - addi AO, AO, 64 - lxvw4x vs24, o0, BO // load b0, b1 - - - - xxspltw vs8, vs24, 0 - xxspltw vs9, vs24, 1 - xxspltw vs10, vs24, 2 - xxspltw vs11, vs24, 3 - + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i - addi BO, BO, 8 + addi BO, BO, 32 xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r @@ -5652,27 +5352,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvw4x vs0, o0, AO // load a0, a1 - lxvw4x vs1, o16, AO // load a2, a3 - lxvw4x vs2, o32, AO // load a4, a5 - lxvw4x vs3, o48, AO // load a6, a7 - addi AO, AO, 64 - lxvw4x vs24, o0, BO // load b0, b1 - + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i - - xxspltw vs8, vs24, 0 - xxspltw vs9, vs24, 1 - xxspltw vs10, vs24, 2 - xxspltw vs11, vs24, 3 - - - addi BO, BO, 8 + addi BO, BO, 32 xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r @@ -5691,27 +5380,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvw4x vs0, o0, AO // load a0, a1 - lxvw4x vs1, o16, AO // load a2, a3 - lxvw4x vs2, o32, AO // load a4, a5 - lxvw4x vs3, o48, AO // load a6, a7 - addi AO, AO, 64 - lxvw4x vs24, o0, BO // load b0, b1 - - - - xxspltw vs8, vs24, 0 - xxspltw vs9, vs24, 1 - xxspltw vs10, vs24, 2 - xxspltw vs11, vs24, 3 - + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i - addi BO, BO, 8 + addi BO, BO, 32 xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r @@ -5729,6 +5407,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE1x8 mr T1, CO + xxlxor vs24, vs24, vs24 // N=0 @@ -5785,7 +5464,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r @@ -5851,7 +5529,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r @@ -5917,7 +5594,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r @@ -5983,7 +5659,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r @@ -6011,23 +5686,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro LOAD1x4_1 lxvw4x vs0, o0, AO // load a0, a1 - lxvw4x vs1, o16, AO // load a2, a3 - addi AO, AO, 32 - lxvw4x vs24, o0, BO // load b0, b1 - - - - xxspltw vs8, vs24, 0 - xxspltw vs9, vs24, 1 - xxspltw vs10, vs24, 2 - xxspltw vs11, vs24, 3 - + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i - addi BO, BO, 8 + addi BO, BO, 32 .endm @@ -6035,23 +5701,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
lxvw4x vs4, o0, AO // load a0, a1 - lxvw4x vs5, o16, AO // load a2, a3 - addi AO, AO, 32 - lxvw4x vs24, o0, BO // load b0, b1 - - - - xxspltw vs16, vs24, 0 - xxspltw vs17, vs24, 1 - xxspltw vs18, vs24, 2 - xxspltw vs19, vs24, 3 + lxvw4x vs16, o0, BO // load b0_r + lxvw4x vs17, o16, BO // load b0_i - - addi BO, BO, 8 + addi BO, BO, 32 xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r @@ -6066,23 +5723,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvw4x vs4, o0, AO // load a0, a1 - lxvw4x vs5, o16, AO // load a2, a3 - addi AO, AO, 32 - lxvw4x vs24, o0, BO // load b0, b1 - - + lxvw4x vs16, o0, BO // load b0_r + lxvw4x vs17, o16, BO // load b0_i - xxspltw vs16, vs24, 0 - xxspltw vs17, vs24, 1 - xxspltw vs18, vs24, 2 - xxspltw vs19, vs24, 3 - - - addi BO, BO, 8 + addi BO, BO, 32 xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r @@ -6097,23 +5745,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvw4x vs0, o0, AO // load a0, a1 - lxvw4x vs1, o16, AO // load a2, a3 - addi AO, AO, 32 - lxvw4x vs24, o0, BO // load b0, b1 - + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i - - xxspltw vs8, vs24, 0 - xxspltw vs9, vs24, 1 - xxspltw vs10, vs24, 2 - xxspltw vs11, vs24, 3 - - - addi BO, BO, 8 + addi BO, BO, 32 xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r @@ -6139,23 +5778,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
lxvw4x vs0, o0, AO // load a0, a1 - lxvw4x vs1, o16, AO // load a2, a3 - addi AO, AO, 32 - lxvw4x vs24, o0, BO // load b0, b1 + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i - - - xxspltw vs8, vs24, 0 - xxspltw vs9, vs24, 1 - xxspltw vs10, vs24, 2 - xxspltw vs11, vs24, 3 - - - addi BO, BO, 8 + addi BO, BO, 32 xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r @@ -6170,23 +5800,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvw4x vs0, o0, AO // load a0, a1 - lxvw4x vs1, o16, AO // load a2, a3 - addi AO, AO, 32 - lxvw4x vs24, o0, BO // load b0, b1 - - + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i - xxspltw vs8, vs24, 0 - xxspltw vs9, vs24, 1 - xxspltw vs10, vs24, 2 - xxspltw vs11, vs24, 3 - - - addi BO, BO, 8 + addi BO, BO, 32 xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r @@ -6200,6 +5821,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE1x4 mr T1, CO + xxlxor vs24, vs24, vs24 // N=0 @@ -6256,7 +5878,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r @@ -6322,7 +5943,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r @@ -6351,20 +5971,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
lxvw4x vs0, o0, AO // load a0, a1 - addi AO, AO, 16 - lxvw4x vs24, o0, BO // load b0, b1 - - + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i - xxspltw vs8, vs24, 0 - xxspltw vs9, vs24, 1 - xxspltw vs10, vs24, 2 - xxspltw vs11, vs24, 3 - - - addi BO, BO, 8 + addi BO, BO, 32 .endm @@ -6373,20 +5985,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvw4x vs4, o0, AO // load a0, a1 - addi AO, AO, 16 - lxvw4x vs24, o0, BO // load b0, b1 - - + lxvw4x vs16, o0, BO // load b0_r + lxvw4x vs17, o16, BO // load b0_i - xxspltw vs16, vs24, 0 - xxspltw vs17, vs24, 1 - xxspltw vs18, vs24, 2 - xxspltw vs19, vs24, 3 - - - addi BO, BO, 8 + addi BO, BO, 32 xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r @@ -6400,20 +6004,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvw4x vs4, o0, AO // load a0, a1 - addi AO, AO, 16 - lxvw4x vs24, o0, BO // load b0, b1 - - + lxvw4x vs16, o0, BO // load b0_r + lxvw4x vs17, o16, BO // load b0_i - xxspltw vs16, vs24, 0 - xxspltw vs17, vs24, 1 - xxspltw vs18, vs24, 2 - xxspltw vs19, vs24, 3 - - - addi BO, BO, 8 + addi BO, BO, 32 xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r @@ -6427,20 +6023,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvw4x vs0, o0, AO // load a0, a1 - addi AO, AO, 16 - lxvw4x vs24, o0, BO // load b0, b1 - - + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i - xxspltw vs8, vs24, 0 - xxspltw vs9, vs24, 1 - xxspltw vs10, vs24, 2 - xxspltw vs11, vs24, 3 - - - addi BO, BO, 8 + addi BO, BO, 32 xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r @@ -6463,20 +6051,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
lxvw4x vs0, o0, AO // load a0, a1 - addi AO, AO, 16 - lxvw4x vs24, o0, BO // load b0, b1 - - + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i - xxspltw vs8, vs24, 0 - xxspltw vs9, vs24, 1 - xxspltw vs10, vs24, 2 - xxspltw vs11, vs24, 3 - - - addi BO, BO, 8 + addi BO, BO, 32 xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r @@ -6490,20 +6070,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvw4x vs0, o0, AO // load a0, a1 - addi AO, AO, 16 - lxvw4x vs24, o0, BO // load b0, b1 - - + lxvw4x vs8, o0, BO // load b0_r + lxvw4x vs9, o16, BO // load b0_i - xxspltw vs8, vs24, 0 - xxspltw vs9, vs24, 1 - xxspltw vs10, vs24, 2 - xxspltw vs11, vs24, 3 - - - addi BO, BO, 8 + addi BO, BO, 32 xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r @@ -6515,6 +6087,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro SAVE1x2 mr T1, CO + xxlxor vs24, vs24, vs24 // N=0 @@ -6571,7 +6144,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r - xxlxor vs24, vs24, vs24 xxsldwi vs20, vs20, vs24, 3 // r0_r xxsldwi vs21, vs21, vs24, 2 // r0_i xxsldwi vs22, vs22, vs24, 1 // r1_r @@ -6603,12 +6175,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 8 - mr T1, BO - - lxsspx vs8, o0, T1 // load b0_r - lxsspx vs9, o4, T1 // load b0_i + lxsspx vs8, o0, BO // load b0_r + lxsspx vs9, o16, BO // load b0_i - addi BO, BO, 8 + addi BO, BO, 32 .endm @@ -6620,12 +6190,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi AO, AO, 8 - mr T1, BO - - lxsspx vs16, o0, T1 // load b0_r - lxsspx vs17, o4, T1 // load b0_i + lxsspx vs16, o0, BO // load b0_r + lxsspx vs17, o16, BO // load b0_i - addi BO, BO, 8 + addi BO, BO, 32 xsmuldp vs32, vs0, vs8 // a0_r*b0_r @@ -6644,12 +6212,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 8 - mr T1, BO + lxsspx vs16, o0, BO // load b0_r + lxsspx vs17, o16, BO // load b0_i - lxsspx vs16, o0, T1 // load b0_r - lxsspx vs17, o4, T1 // load b0_i - - addi BO, BO, 8 + addi BO, BO, 32 xsmaddadp vs32, vs0, vs8 // a0_r*b0_r @@ -6668,12 +6234,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 8 - mr T1, BO - - lxsspx vs8, o0, T1 // load b0_r - lxsspx vs9, o4, T1 // load b0_i + lxsspx vs8, o0, BO // load b0_r + lxsspx vs9, o16, BO // load b0_i - addi BO, BO, 8 + addi BO, BO, 32 xsmaddadp vs32, vs4, vs16 // a4_r*b0_r @@ -6703,12 +6267,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 8 - mr T1, BO - - lxsspx vs8, o0, T1 // load b0_r - lxsspx vs9, o4, T1 // load b0_i + lxsspx vs8, o0, BO // load b0_r + lxsspx vs9, o16, BO // load b0_i - addi BO, BO, 8 + addi BO, BO, 32 xsmuldp vs32, vs0, vs8 // a0_r*b0_r @@ -6727,12 +6289,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 8 - mr T1, BO + lxsspx vs8, o0, BO // load b0_r + lxsspx vs9, o16, BO // load b0_i - lxsspx vs8, o0, T1 // load b0_r - lxsspx vs9, o4, T1 // load b0_i - - addi BO, BO, 8 + addi BO, BO, 32 xsmaddadp vs32, vs0, vs8 // a0_r*b0_r @@ -6746,6 +6306,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro SAVE1x1 mr T1, CO + xxlxor vs24, vs24, vs24 // N=0 diff --git a/kernel/power/ctrmm_kernel_8x4_power8.S b/kernel/power/ctrmm_kernel_8x4_power8.S index b202114dd..460a387fb 100644 --- a/kernel/power/ctrmm_kernel_8x4_power8.S +++ b/kernel/power/ctrmm_kernel_8x4_power8.S @@ -26,7 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** -* 2016/04/03 Werner Saar (wernsaar@googlemail.com) +* 2016/04/04 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK @@ -275,7 +275,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #endif -#include "cgemm_macros_8x4_power8.S" +#include "ctrmm_macros_8x4_power8.S" cmpwi cr0, M, 0 ble L999_H1 diff --git a/kernel/power/ctrmm_logic_8x4_power8.S b/kernel/power/ctrmm_logic_8x4_power8.S index 3e50646b0..9ab258501 100644 --- a/kernel/power/ctrmm_logic_8x4_power8.S +++ b/kernel/power/ctrmm_logic_8x4_power8.S @@ -26,7 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /************************************************************************************** -* 2016/04/03 Werner Saar (wernsaar@googlemail.com) +* 2016/04/04 Werner Saar (wernsaar@googlemail.com) * BLASTEST : OK * CTEST : OK * TEST : OK @@ -83,15 +83,22 @@ CTRMM_L4x8_BEGIN: CTRMM_L4x8_LOOP_START: + dcbt AO, PRE + dcbt BO, PRE LOAD4x8_1 KERNEL4x8_I1 + dcbt AO, PRE KERNEL4x8_2 KERNEL4x8_1 + dcbt AO, PRE KERNEL4x8_2 KERNEL4x8_1 + dcbt AO, PRE KERNEL4x8_2 KERNEL4x8_1 + dcbt AO, PRE + dcbt BO, PRE KERNEL4x8_2 addic. 
L, L, -2 @@ -102,13 +109,18 @@ CTRMM_L4x8_LOOP_START: CTRMM_L4x8_LOOP: KERNEL4x8_1 + dcbt AO, PRE KERNEL4x8_2 KERNEL4x8_1 + dcbt AO, PRE KERNEL4x8_2 KERNEL4x8_1 + dcbt AO, PRE KERNEL4x8_2 KERNEL4x8_1 + dcbt AO, PRE + dcbt BO, PRE KERNEL4x8_2 addic. L, L, -1 @@ -117,8 +129,10 @@ CTRMM_L4x8_LOOP: CTRMM_L4x8_LOOP_END: KERNEL4x8_1 + dcbt AO, PRE KERNEL4x8_2 KERNEL4x8_1 + dcbt AO, PRE KERNEL4x8_2 KERNEL4x8_1 diff --git a/kernel/power/ctrmm_macros_8x4_power8.S b/kernel/power/ctrmm_macros_8x4_power8.S new file mode 100644 index 000000000..48a21252c --- /dev/null +++ b/kernel/power/ctrmm_macros_8x4_power8.S @@ -0,0 +1,6794 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2016/04/04 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + + #define XSFADD_R1 xsadddp + #define XSFADD_R2 xssubdp + #define XSFADD_I1 xsadddp + #define XSFADD_I2 xsadddp + #define XVFADD_R1 xvaddsp + #define XVFADD_R2 xvsubsp + #define XVFADD_I1 xvaddsp + #define XVFADD_I2 xvaddsp + +#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) + + #define XSFADD_R1 xsadddp + #define XSFADD_R2 xsadddp + #define XSFADD_I1 xssubdp + #define XSFADD_I2 xsadddp + #define XVFADD_R1 xvaddsp + #define XVFADD_R2 xvaddsp + #define XVFADD_I1 xvsubsp + #define XVFADD_I2 xvaddsp + +#elif defined(NC) || defined(TC) || defined(NR) || defined(TR) + + #define XSFADD_R1 xsadddp + #define XSFADD_R2 xsadddp + #define XSFADD_I1 xsadddp + #define XSFADD_I2 xssubdp + #define XVFADD_R1 xvaddsp + #define XVFADD_R2 xvaddsp + #define XVFADD_I1 xvaddsp + #define XVFADD_I2 xvsubsp + +#else // CC || CR || RC || RR + + #define XSFADD_R1 xsadddp + #define XSFADD_R2 xssubdp + #define XSFADD_I1 xssubdp + #define XSFADD_I2 xssubdp + #define XVFADD_R1 xvaddsp + #define XVFADD_R2 xvsubsp + #define XVFADD_I1 
xvsubsp + #define XVFADD_I2 xvsubsp + +#endif + +/********************************************************************************************** +* Macros for N=4 and M=8 +**********************************************************************************************/ + +.macro LOAD4x8_1 + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + lxvw4x vs2, o32, AO // load a4, a5 + + lxvw4x vs3, o48, AO // load a6, a7 + + + addi AO, AO, 64 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + lxvw4x vs25, o16, BO // load b2, b3 + + xxspltw vs12, vs25, 0 + xxspltw vs13, vs25, 1 + xxspltw vs14, vs25, 2 + xxspltw vs15, vs25, 3 + + + addi BO, BO, 32 + +.endm + +.macro KERNEL4x8_I1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + lxvw4x vs5, o16, AO // load a2, a3 + + lxvw4x vs6, o32, AO // load a4, a5 + + lxvw4x vs7, o48, AO // load a6, a7 + + + addi AO, AO, 64 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs16, vs24, 0 + xxspltw vs17, vs24, 1 + xxspltw vs18, vs24, 2 + xxspltw vs19, vs24, 3 + + lxvw4x vs25, o16, BO // load b2, b3 + + xxspltw vs20, vs25, 0 + xxspltw vs21, vs25, 1 + xxspltw vs22, vs25, 2 + xxspltw vs23, vs25, 3 + + + addi BO, BO, 32 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmulsp vs40, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs41, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp 
vs42, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs43, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs44, vs2, vs10 // a2_r*b1_r, a2_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs45, vs2, vs11 // a2_r*b1_i, a2_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs46, vs3, vs10 // a3_r*b1_r, a3_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs47, vs3, vs11 // a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmulsp vs48, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmulsp vs49, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmulsp vs50, vs1, vs12 // a1_r*b2_r, a1_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmulsp vs51, vs1, vs13 // a1_r*b2_i, a1_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmulsp vs52, vs2, vs12 // a2_r*b2_r, a2_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmulsp vs53, vs2, vs13 // a2_r*b2_i, a2_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmulsp vs54, vs3, vs12 // a3_r*b2_r, a3_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmulsp vs55, vs3, vs13 // a3_r*b2_i, a3_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmulsp vs56, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmulsp vs57, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmulsp vs58, vs1, vs14 // a1_r*b3_r, a1_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmulsp vs59, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmulsp vs60, vs2, vs14 // a2_r*b3_r, a2_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmulsp vs61, vs2, vs15 // a2_r*b3_i, a2_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmulsp vs62, vs3, vs14 // a3_r*b3_r, a3_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmulsp vs63, vs3, vs15 // a3_r*b3_i, a3_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro KERNEL4x8_1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + lxvw4x vs5, o16, AO // load a2, a3 + + lxvw4x vs6, o32, AO // load a4, a5 + + lxvw4x vs7, o48, AO // load a6, a7 + + + addi AO, AO, 64 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs16, vs24, 0 + xxspltw vs17, vs24, 1 + xxspltw vs18, vs24, 2 + xxspltw vs19, vs24, 3 + + lxvw4x vs25, o16, BO // load b2, b3 + + xxspltw vs20, vs25, 0 + 
xxspltw vs21, vs25, 1 + xxspltw vs22, vs25, 2 + xxspltw vs23, vs25, 3 + + + addi BO, BO, 32 + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs40, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs41, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs42, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs43, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs44, vs2, vs10 // a2_r*b1_r, a2_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs45, vs2, vs11 // a2_r*b1_i, a2_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs46, vs3, vs10 // a3_r*b1_r, a3_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs47, vs3, vs11 // a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmaddasp vs48, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs49, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs50, vs1, vs12 // a1_r*b2_r, a1_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs51, vs1, vs13 // a1_r*b2_i, a1_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs52, vs2, vs12 // a2_r*b2_r, a2_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs53, vs2, vs13 // a2_r*b2_i, a2_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs54, vs3, vs12 // a3_r*b2_r, a3_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs55, vs3, vs13 // a3_r*b2_i, a3_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmaddasp vs56, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs57, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, 
a1_r*b3_i, a1_i*b3_i + xvmaddasp vs58, vs1, vs14 // a1_r*b3_r, a1_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs59, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmaddasp vs60, vs2, vs14 // a2_r*b3_r, a2_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs61, vs2, vs15 // a2_r*b3_i, a2_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmaddasp vs62, vs3, vs14 // a3_r*b3_r, a3_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs63, vs3, vs15 // a3_r*b3_i, a3_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro KERNEL4x8_2 + + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + lxvw4x vs2, o32, AO // load a4, a5 + + lxvw4x vs3, o48, AO // load a6, a7 + + + addi AO, AO, 64 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + lxvw4x vs25, o16, BO // load b2, b3 + + xxspltw vs12, vs25, 0 + xxspltw vs13, vs25, 1 + xxspltw vs14, vs25, 2 + xxspltw vs15, vs25, 3 + + + addi BO, BO, 32 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs36, vs6, vs16 // a6_r*b0_r, a6_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs37, vs6, vs17 // a6_r*b0_i, a6_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs38, vs7, vs16 // a7_r*b0_r, a7_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs39, vs7, vs17 // a7_r*b0_i, a7_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs40, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs41, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs42, vs5, vs18 // a5_r*b1_r, a5_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs43, vs5, vs19 // a5_r*b1_i, a5_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs44, vs6, vs18 // a6_r*b1_r, a6_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs45, vs6, vs19 // a6_r*b1_i, a6_i*b1_i, 
a1_r*b1_i, a1_i*b1_i + xvmaddasp vs46, vs7, vs18 // a7_r*b1_r, a7_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs47, vs7, vs19 // a7_r*b1_i, a7_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmaddasp vs48, vs4, vs20 // a4_r*b2_r, a4_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs49, vs4, vs21 // a4_r*b2_i, a4_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs50, vs5, vs20 // a5_r*b2_r, a5_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs51, vs5, vs21 // a5_r*b2_i, a5_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs52, vs6, vs20 // a6_r*b2_r, a6_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs53, vs6, vs21 // a6_r*b2_i, a6_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs54, vs7, vs20 // a7_r*b2_r, a7_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs55, vs7, vs21 // a7_r*b2_i, a7_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmaddasp vs56, vs4, vs22 // a4_r*b3_r, a4_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs57, vs4, vs23 // a4_r*b3_i, a4_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmaddasp vs58, vs5, vs22 // a5_r*b3_r, a5_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs59, vs5, vs23 // a5_r*b3_i, a5_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmaddasp vs60, vs6, vs22 // a6_r*b3_r, a6_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs61, vs6, vs23 // a6_r*b3_i, a6_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmaddasp vs62, vs7, vs22 // a7_r*b3_r, a7_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs63, vs7, vs23 // a7_r*b3_i, a7_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro KERNEL4x8_E2 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs36, vs6, vs16 // a6_r*b0_r, a6_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs37, vs6, vs17 // a6_r*b0_i, a6_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs38, vs7, vs16 // a7_r*b0_r, a7_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs39, vs7, vs17 // a7_r*b0_i, a7_i*b0_i, a1_r*b0_i, a1_i*b0_i + + 
xvmaddasp vs40, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs41, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs42, vs5, vs18 // a5_r*b1_r, a5_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs43, vs5, vs19 // a5_r*b1_i, a5_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs44, vs6, vs18 // a6_r*b1_r, a6_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs45, vs6, vs19 // a6_r*b1_i, a6_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs46, vs7, vs18 // a7_r*b1_r, a7_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs47, vs7, vs19 // a7_r*b1_i, a7_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmaddasp vs48, vs4, vs20 // a4_r*b2_r, a4_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs49, vs4, vs21 // a4_r*b2_i, a4_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs50, vs5, vs20 // a5_r*b2_r, a5_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs51, vs5, vs21 // a5_r*b2_i, a5_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs52, vs6, vs20 // a6_r*b2_r, a6_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs53, vs6, vs21 // a6_r*b2_i, a6_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs54, vs7, vs20 // a7_r*b2_r, a7_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs55, vs7, vs21 // a7_r*b2_i, a7_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmaddasp vs56, vs4, vs22 // a4_r*b3_r, a4_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs57, vs4, vs23 // a4_r*b3_i, a4_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmaddasp vs58, vs5, vs22 // a5_r*b3_r, a5_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs59, vs5, vs23 // a5_r*b3_i, a5_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmaddasp vs60, vs6, vs22 // a6_r*b3_r, a6_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs61, vs6, vs23 // a6_r*b3_i, a6_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmaddasp vs62, vs7, vs22 // a7_r*b3_r, a7_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs63, vs7, vs23 // a7_r*b3_i, a7_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro KERNEL4x8_SUBI1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + lxvw4x vs2, o32, AO // load a4, a5 + + lxvw4x vs3, o48, AO // load a6, a7 + + + addi AO, AO, 64 + + 
lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + lxvw4x vs25, o16, BO // load b2, b3 + + xxspltw vs12, vs25, 0 + xxspltw vs13, vs25, 1 + xxspltw vs14, vs25, 2 + xxspltw vs15, vs25, 3 + + + addi BO, BO, 32 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmulsp vs40, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs41, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs42, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs43, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs44, vs2, vs10 // a2_r*b1_r, a2_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs45, vs2, vs11 // a2_r*b1_i, a2_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs46, vs3, vs10 // a3_r*b1_r, a3_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs47, vs3, vs11 // a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmulsp vs48, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmulsp vs49, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmulsp vs50, vs1, vs12 // a1_r*b2_r, a1_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmulsp vs51, vs1, vs13 // a1_r*b2_i, a1_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmulsp vs52, vs2, vs12 // a2_r*b2_r, a2_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmulsp vs53, vs2, vs13 // a2_r*b2_i, a2_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmulsp vs54, vs3, vs12 // a3_r*b2_r, a3_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmulsp vs55, vs3, vs13 // a3_r*b2_i, a3_i*b2_i, 
a1_r*b2_i, a1_i*b2_i + + xvmulsp vs56, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmulsp vs57, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmulsp vs58, vs1, vs14 // a1_r*b3_r, a1_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmulsp vs59, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmulsp vs60, vs2, vs14 // a2_r*b3_r, a2_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmulsp vs61, vs2, vs15 // a2_r*b3_i, a2_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmulsp vs62, vs3, vs14 // a3_r*b3_r, a3_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmulsp vs63, vs3, vs15 // a3_r*b3_i, a3_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro KERNEL4x8_SUB1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + lxvw4x vs2, o32, AO // load a4, a5 + + lxvw4x vs3, o48, AO // load a6, a7 + + + addi AO, AO, 64 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + lxvw4x vs25, o16, BO // load b2, b3 + + xxspltw vs12, vs25, 0 + xxspltw vs13, vs25, 1 + xxspltw vs14, vs25, 2 + xxspltw vs15, vs25, 3 + + + addi BO, BO, 32 + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs40, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs41, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs42, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs43, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + 
xvmaddasp vs44, vs2, vs10 // a2_r*b1_r, a2_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs45, vs2, vs11 // a2_r*b1_i, a2_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs46, vs3, vs10 // a3_r*b1_r, a3_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs47, vs3, vs11 // a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmaddasp vs48, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs49, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs50, vs1, vs12 // a1_r*b2_r, a1_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs51, vs1, vs13 // a1_r*b2_i, a1_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs52, vs2, vs12 // a2_r*b2_r, a2_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs53, vs2, vs13 // a2_r*b2_i, a2_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs54, vs3, vs12 // a3_r*b2_r, a3_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs55, vs3, vs13 // a3_r*b2_i, a3_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmaddasp vs56, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs57, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmaddasp vs58, vs1, vs14 // a1_r*b3_r, a1_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs59, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmaddasp vs60, vs2, vs14 // a2_r*b3_r, a2_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs61, vs2, vs15 // a2_r*b3_i, a2_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmaddasp vs62, vs3, vs14 // a3_r*b3_r, a3_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs63, vs3, vs15 // a3_r*b3_i, a3_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro SAVE4x8 + + mr T1, CO + +// N=0 + + mr T2, T1 + +// N=0 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs32, 0 + xxspltw vs9, vs32, 1 + xxspltw vs10, vs32, 2 + xxspltw vs11, vs32, 3 + + + xxspltw vs12, vs33, 0 + xxspltw vs13, vs33, 1 + xxspltw vs14, vs33, 2 + xxspltw vs15, vs33, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + 
XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=0 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs34, 0 + xxspltw vs9, vs34, 1 + xxspltw vs10, vs34, 2 + xxspltw vs11, vs34, 3 + + + xxspltw vs12, vs35, 0 + xxspltw vs13, vs35, 1 + xxspltw vs14, vs35, 2 + xxspltw vs15, vs35, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 
vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=0 M=4 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs36, 0 + xxspltw vs9, vs36, 1 + xxspltw vs10, vs36, 2 + xxspltw vs11, vs36, 3 + + + xxspltw vs12, vs37, 0 + xxspltw vs13, vs37, 1 + xxspltw vs14, vs37, 2 + xxspltw vs15, vs37, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, 
alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=0 M=6 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs38, 0 + xxspltw vs9, vs38, 1 + xxspltw vs10, vs38, 2 + xxspltw vs11, vs38, 3 + + + xxspltw vs12, vs39, 0 + xxspltw vs13, vs39, 1 + xxspltw vs14, vs39, 2 + xxspltw vs15, vs39, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp 
vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + +// N=1 + + mr T2, T1 + +// N=1 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs40, 0 + xxspltw vs9, vs40, 1 + xxspltw vs10, vs40, 2 + xxspltw vs11, vs40, 3 + + + xxspltw vs12, vs41, 0 + xxspltw vs13, vs41, 1 + xxspltw vs14, vs41, 2 + xxspltw vs15, vs41, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + 
xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=1 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs42, 0 + xxspltw vs9, vs42, 1 + xxspltw vs10, vs42, 2 + xxspltw vs11, vs42, 3 + + + xxspltw vs12, vs43, 0 + xxspltw vs13, vs43, 1 + xxspltw vs14, vs43, 2 + xxspltw vs15, vs43, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * 
alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=1 M=4 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs44, 0 + xxspltw vs9, vs44, 1 + xxspltw vs10, vs44, 2 + xxspltw vs11, vs44, 3 + + + xxspltw vs12, vs45, 0 + xxspltw vs13, vs45, 1 + xxspltw vs14, vs45, 2 + xxspltw vs15, vs45, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 
// r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=1 M=6 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs46, 0 + xxspltw vs9, vs46, 1 + xxspltw vs10, vs46, 2 + xxspltw vs11, vs46, 3 + + + xxspltw vs12, vs47, 0 + xxspltw vs13, vs47, 1 + xxspltw vs14, vs47, 2 + xxspltw vs15, vs47, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, 
c1 + + addi T2, T2, 16 + add T1, T1, LDC + + +// N=2 + + mr T2, T1 + +// N=2 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs48, 0 + xxspltw vs9, vs48, 1 + xxspltw vs10, vs48, 2 + xxspltw vs11, vs48, 3 + + + xxspltw vs12, vs49, 0 + xxspltw vs13, vs49, 1 + xxspltw vs14, vs49, 2 + xxspltw vs15, vs49, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=2 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + 
lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs50, 0 + xxspltw vs9, vs50, 1 + xxspltw vs10, vs50, 2 + xxspltw vs11, vs50, 3 + + + xxspltw vs12, vs51, 0 + xxspltw vs13, vs51, 1 + xxspltw vs14, vs51, 2 + xxspltw vs15, vs51, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=2 M=4 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs52, 0 + xxspltw vs9, vs52, 1 + xxspltw vs10, vs52, 2 + xxspltw vs11, vs52, 3 + + + xxspltw vs12, vs53, 0 + 
xxspltw vs13, vs53, 1 + xxspltw vs14, vs53, 2 + xxspltw vs15, vs53, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=2 M=6 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs54, 0 + xxspltw vs9, vs54, 1 + xxspltw vs10, vs54, 2 + xxspltw vs11, vs54, 3 + + + xxspltw vs12, vs55, 0 + xxspltw vs13, vs55, 1 + xxspltw vs14, vs55, 2 + xxspltw vs15, vs55, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add 
a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + +// N=3 + + mr T2, T1 + +// N=3 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs56, 0 + xxspltw vs9, vs56, 1 + xxspltw vs10, vs56, 2 + xxspltw vs11, vs56, 3 + + + xxspltw vs12, vs57, 0 + xxspltw vs13, vs57, 1 + xxspltw vs14, vs57, 2 + xxspltw vs15, vs57, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + 
XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=3 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs58, 0 + xxspltw vs9, vs58, 1 + xxspltw vs10, vs58, 2 + xxspltw vs11, vs58, 3 + + + xxspltw vs12, vs59, 0 + xxspltw vs13, vs59, 1 + xxspltw vs14, vs59, 2 + xxspltw vs15, vs59, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp 
vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=3 M=4 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs60, 0 + xxspltw vs9, vs60, 1 + xxspltw vs10, vs60, 2 + xxspltw vs11, vs60, 3 + + + xxspltw vs12, vs61, 0 + xxspltw vs13, vs61, 1 + xxspltw vs14, vs61, 2 + xxspltw vs15, vs61, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + 
r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=3 M=6 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs62, 0 + xxspltw vs9, vs62, 1 + xxspltw vs10, vs62, 2 + xxspltw vs11, vs62, 3 + + + xxspltw vs12, vs63, 0 + xxspltw vs13, vs63, 1 + xxspltw vs14, vs63, 2 + xxspltw vs15, vs63, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i 
* alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + addi CO, CO, 64 + +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=4 +**********************************************************************************************/ + +.macro LOAD4x4_1 + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + + addi AO, AO, 32 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + lxvw4x vs25, o16, BO // load b2, b3 + + xxspltw vs12, vs25, 0 + xxspltw vs13, vs25, 1 + xxspltw vs14, vs25, 2 + xxspltw vs15, vs25, 3 + + + addi BO, BO, 32 + +.endm + +.macro KERNEL4x4_I1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + lxvw4x vs5, o16, AO // load a2, a3 + + + addi AO, AO, 32 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs16, vs24, 0 + xxspltw vs17, vs24, 1 + xxspltw vs18, vs24, 2 + xxspltw vs19, vs24, 3 + + lxvw4x vs25, o16, BO // load b2, b3 + + xxspltw vs20, vs25, 0 + xxspltw vs21, vs25, 1 + xxspltw vs22, vs25, 2 + xxspltw vs23, vs25, 3 + + + addi BO, BO, 32 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmulsp vs36, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, 
a1_i*b1_r + xvmulsp vs37, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs38, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmulsp vs40, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmulsp vs41, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmulsp vs42, vs1, vs12 // a1_r*b2_r, a1_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmulsp vs43, vs1, vs13 // a1_r*b2_i, a1_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmulsp vs44, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmulsp vs45, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmulsp vs46, vs1, vs14 // a1_r*b3_r, a1_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmulsp vs47, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro KERNEL4x4_1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + lxvw4x vs5, o16, AO // load a2, a3 + + + addi AO, AO, 32 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs16, vs24, 0 + xxspltw vs17, vs24, 1 + xxspltw vs18, vs24, 2 + xxspltw vs19, vs24, 3 + + lxvw4x vs25, o16, BO // load b2, b3 + + xxspltw vs20, vs25, 0 + xxspltw vs21, vs25, 1 + xxspltw vs22, vs25, 2 + xxspltw vs23, vs25, 3 + + + addi BO, BO, 32 + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs36, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs37, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs38, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmaddasp vs40, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs41, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs42, 
vs1, vs12 // a1_r*b2_r, a1_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs43, vs1, vs13 // a1_r*b2_i, a1_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmaddasp vs44, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs45, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmaddasp vs46, vs1, vs14 // a1_r*b3_r, a1_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs47, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro KERNEL4x4_2 + + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + + addi AO, AO, 32 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + lxvw4x vs25, o16, BO // load b2, b3 + + xxspltw vs12, vs25, 0 + xxspltw vs13, vs25, 1 + xxspltw vs14, vs25, 2 + xxspltw vs15, vs25, 3 + + + addi BO, BO, 32 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs36, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs37, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs38, vs5, vs18 // a5_r*b1_r, a5_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs39, vs5, vs19 // a5_r*b1_i, a5_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmaddasp vs40, vs4, vs20 // a4_r*b2_r, a4_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs41, vs4, vs21 // a4_r*b2_i, a4_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs42, vs5, vs20 // a5_r*b2_r, a5_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs43, vs5, vs21 // a5_r*b2_i, a5_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmaddasp vs44, vs4, vs22 // a4_r*b3_r, a4_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs45, vs4, vs23 // a4_r*b3_i, a4_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmaddasp vs46, vs5, vs22 // a5_r*b3_r, a5_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs47, vs5, 
vs23 // a5_r*b3_i, a5_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro KERNEL4x4_E2 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs36, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs37, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs38, vs5, vs18 // a5_r*b1_r, a5_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs39, vs5, vs19 // a5_r*b1_i, a5_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmaddasp vs40, vs4, vs20 // a4_r*b2_r, a4_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs41, vs4, vs21 // a4_r*b2_i, a4_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs42, vs5, vs20 // a5_r*b2_r, a5_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs43, vs5, vs21 // a5_r*b2_i, a5_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmaddasp vs44, vs4, vs22 // a4_r*b3_r, a4_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs45, vs4, vs23 // a4_r*b3_i, a4_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmaddasp vs46, vs5, vs22 // a5_r*b3_r, a5_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs47, vs5, vs23 // a5_r*b3_i, a5_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro KERNEL4x4_SUBI1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + + addi AO, AO, 32 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + lxvw4x vs25, o16, BO // load b2, b3 + + xxspltw vs12, vs25, 0 + xxspltw vs13, vs25, 1 + xxspltw vs14, vs25, 2 + xxspltw vs15, vs25, 3 + + + addi BO, BO, 32 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i 
+ + xvmulsp vs36, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs37, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs38, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmulsp vs40, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmulsp vs41, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmulsp vs42, vs1, vs12 // a1_r*b2_r, a1_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmulsp vs43, vs1, vs13 // a1_r*b2_i, a1_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmulsp vs44, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmulsp vs45, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmulsp vs46, vs1, vs14 // a1_r*b3_r, a1_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmulsp vs47, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro KERNEL4x4_SUB1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + + addi AO, AO, 32 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + lxvw4x vs25, o16, BO // load b2, b3 + + xxspltw vs12, vs25, 0 + xxspltw vs13, vs25, 1 + xxspltw vs14, vs25, 2 + xxspltw vs15, vs25, 3 + + + addi BO, BO, 32 + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs36, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs37, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs38, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmaddasp vs40, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs41, vs0, vs13 
// a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i + xvmaddasp vs42, vs1, vs12 // a1_r*b2_r, a1_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs43, vs1, vs13 // a1_r*b2_i, a1_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmaddasp vs44, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs45, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i + xvmaddasp vs46, vs1, vs14 // a1_r*b3_r, a1_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs47, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro SAVE4x4 + + mr T1, CO + +// N=0 + + mr T2, T1 + +// N=0 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs32, 0 + xxspltw vs9, vs32, 1 + xxspltw vs10, vs32, 2 + xxspltw vs11, vs32, 3 + + + xxspltw vs12, vs33, 0 + xxspltw vs13, vs33, 1 + xxspltw vs14, vs33, 2 + xxspltw vs15, vs33, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + 
xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=0 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs34, 0 + xxspltw vs9, vs34, 1 + xxspltw vs10, vs34, 2 + xxspltw vs11, vs34, 3 + + + xxspltw vs12, vs35, 0 + xxspltw vs13, vs35, 1 + xxspltw vs14, vs35, 2 + xxspltw vs15, vs35, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + 
xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + +// N=1 + + mr T2, T1 + +// N=1 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs36, 0 + xxspltw vs9, vs36, 1 + xxspltw vs10, vs36, 2 + xxspltw vs11, vs36, 3 + + + xxspltw vs12, vs37, 0 + xxspltw vs13, vs37, 1 + xxspltw vs14, vs37, 2 + xxspltw vs15, vs37, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + 
addi T2, T2, 16 + +// N=1 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs38, 0 + xxspltw vs9, vs38, 1 + xxspltw vs10, vs38, 2 + xxspltw vs11, vs38, 3 + + + xxspltw vs12, vs39, 0 + xxspltw vs13, vs39, 1 + xxspltw vs14, vs39, 2 + xxspltw vs15, vs39, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + +// N=2 + + mr T2, T1 + +// N=2 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x 
vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs40, 0 + xxspltw vs9, vs40, 1 + xxspltw vs10, vs40, 2 + xxspltw vs11, vs40, 3 + + + xxspltw vs12, vs41, 0 + xxspltw vs13, vs41, 1 + xxspltw vs14, vs41, 2 + xxspltw vs15, vs41, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=2 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs42, 0 + xxspltw vs9, vs42, 1 + xxspltw vs10, vs42, 2 + xxspltw vs11, vs42, 3 + + + xxspltw vs12, vs43, 0 + xxspltw 
vs13, vs43, 1 + xxspltw vs14, vs43, 2 + xxspltw vs15, vs43, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + +// N=3 + + mr T2, T1 + +// N=3 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs44, 0 + xxspltw vs9, vs44, 1 + xxspltw vs10, vs44, 2 + xxspltw vs11, vs44, 3 + + + xxspltw vs12, vs45, 0 + xxspltw vs13, vs45, 1 + xxspltw vs14, vs45, 2 + xxspltw vs15, vs45, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i 
+ XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=3 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs46, 0 + xxspltw vs9, vs46, 1 + xxspltw vs10, vs46, 2 + xxspltw vs11, vs46, 3 + + + xxspltw vs12, vs47, 0 + xxspltw vs13, vs47, 1 + xxspltw vs14, vs47, 2 + xxspltw vs15, vs47, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 
vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + addi CO, CO, 32 + +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=2 +**********************************************************************************************/ + +.macro LOAD4x2_1 + + lxvw4x vs0, o0, AO // load a0, a1 + + + addi AO, AO, 16 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + lxvw4x vs25, o16, BO // load b2, b3 + + xxspltw vs12, vs25, 0 + xxspltw vs13, vs25, 1 + xxspltw vs14, vs25, 2 + xxspltw vs15, vs25, 3 + + + addi BO, BO, 32 + +.endm + +.macro KERNEL4x2_I1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + + addi AO, AO, 16 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs16, vs24, 0 + xxspltw vs17, vs24, 1 + xxspltw vs18, vs24, 2 + xxspltw vs19, vs24, 3 + + 
lxvw4x vs25, o16, BO // load b2, b3 + + xxspltw vs20, vs25, 0 + xxspltw vs21, vs25, 1 + xxspltw vs22, vs25, 2 + xxspltw vs23, vs25, 3 + + + addi BO, BO, 32 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmulsp vs34, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmulsp vs36, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmulsp vs37, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmulsp vs38, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmulsp vs39, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro KERNEL4x2_1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + + addi AO, AO, 16 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs16, vs24, 0 + xxspltw vs17, vs24, 1 + xxspltw vs18, vs24, 2 + xxspltw vs19, vs24, 3 + + lxvw4x vs25, o16, BO // load b2, b3 + + xxspltw vs20, vs25, 0 + xxspltw vs21, vs25, 1 + xxspltw vs22, vs25, 2 + xxspltw vs23, vs25, 3 + + + addi BO, BO, 32 + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs34, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmaddasp vs36, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs37, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmaddasp vs38, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs39, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro KERNEL4x2_2 + + + lxvw4x vs0, o0, AO // load a0, a1 + + + addi AO, AO, 16 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + lxvw4x vs25, o16, BO // load b2, 
b3 + + xxspltw vs12, vs25, 0 + xxspltw vs13, vs25, 1 + xxspltw vs14, vs25, 2 + xxspltw vs15, vs25, 3 + + + addi BO, BO, 32 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs34, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs35, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmaddasp vs36, vs4, vs20 // a4_r*b2_r, a4_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs37, vs4, vs21 // a4_r*b2_i, a4_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmaddasp vs38, vs4, vs22 // a4_r*b3_r, a4_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs39, vs4, vs23 // a4_r*b3_i, a4_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro KERNEL4x2_E2 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs34, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs35, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmaddasp vs36, vs4, vs20 // a4_r*b2_r, a4_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs37, vs4, vs21 // a4_r*b2_i, a4_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmaddasp vs38, vs4, vs22 // a4_r*b3_r, a4_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs39, vs4, vs23 // a4_r*b3_i, a4_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro KERNEL4x2_SUBI1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + + addi AO, AO, 16 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + lxvw4x vs25, o16, BO // load b2, b3 + + xxspltw vs12, vs25, 0 + xxspltw vs13, vs25, 1 + xxspltw vs14, vs25, 2 + xxspltw vs15, vs25, 3 + + + addi BO, BO, 32 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmulsp vs34, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs35, vs0, 
vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmulsp vs36, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmulsp vs37, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmulsp vs38, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmulsp vs39, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro KERNEL4x2_SUB1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + + addi AO, AO, 16 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + lxvw4x vs25, o16, BO // load b2, b3 + + xxspltw vs12, vs25, 0 + xxspltw vs13, vs25, 1 + xxspltw vs14, vs25, 2 + xxspltw vs15, vs25, 3 + + + addi BO, BO, 32 + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs34, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + + xvmaddasp vs36, vs0, vs12 // a0_r*b2_r, a0_i*b2_r, a1_r*b2_r, a1_i*b2_r + xvmaddasp vs37, vs0, vs13 // a0_r*b2_i, a0_i*b2_i, a1_r*b2_i, a1_i*b2_i + + xvmaddasp vs38, vs0, vs14 // a0_r*b3_r, a0_i*b3_r, a1_r*b3_r, a1_i*b3_r + xvmaddasp vs39, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i + + +.endm + +.macro SAVE4x2 + + mr T1, CO + +// N=0 + + mr T2, T1 + +// N=0 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs32, 0 + xxspltw vs9, vs32, 1 + xxspltw vs10, vs32, 2 + xxspltw vs11, vs32, 3 + + + xxspltw vs12, vs33, 0 + xxspltw vs13, vs33, 1 + xxspltw vs14, vs33, 2 + xxspltw vs15, vs33, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 
vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + +// N=1 + + mr T2, T1 + +// N=1 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs34, 0 + xxspltw vs9, vs34, 1 + xxspltw vs10, vs34, 2 + xxspltw vs11, vs34, 3 + + + xxspltw vs12, vs35, 0 + xxspltw vs13, vs35, 1 + xxspltw vs14, vs35, 2 + xxspltw vs15, vs35, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // 
add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + +// N=2 + + mr T2, T1 + +// N=2 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs36, 0 + xxspltw vs9, vs36, 1 + xxspltw vs10, vs36, 2 + xxspltw vs11, vs36, 3 + + + xxspltw vs12, vs37, 0 + xxspltw vs13, vs37, 1 + xxspltw vs14, vs37, 2 + xxspltw vs15, vs37, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * 
alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + +// N=3 + + mr T2, T1 + +// N=3 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs38, 0 + xxspltw vs9, vs38, 1 + xxspltw vs10, vs38, 2 + xxspltw vs11, vs38, 3 + + + xxspltw vs12, vs39, 0 + xxspltw vs13, vs39, 1 + xxspltw vs14, vs39, 2 + xxspltw vs15, vs39, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r 
* alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + addi CO, CO, 16 + +.endm + + +/********************************************************************************************** +* Macros for N=4 and M=1 +**********************************************************************************************/ + +.macro LOAD4x1_1 + + lxsspx vs0, o0, AO // load a0_r + lxsspx vs1, o4, AO // load a0_i + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 // load b0_r + lxsspx vs9, o4, T1 // load b0_i + + addi T1, T1,8 + + lxsspx vs10, o0, T1 // load b1_r + lxsspx vs11, o4, T1 // load b1_i + + addi T1, T1,8 + + lxsspx vs12, o0, T1 // load b2_r + lxsspx vs13, o4, T1 // load b2_i + + addi T1, T1,8 + + lxsspx vs14, o0, T1 // load b3_r + lxsspx vs15, o4, T1 // load b3_i + + addi BO, BO, 32 + +.endm + +.macro KERNEL4x1_I1 + + + lxsspx vs4, o0, AO // load a0_r + lxsspx vs5, o4, AO // load a0_i + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs16, o0, T1 // load b0_r + lxsspx vs17, o4, T1 // load b0_i + + addi T1, T1,8 + + lxsspx vs18, o0, T1 // load b1_r + lxsspx vs19, o4, T1 // load b1_i + + addi T1, T1,8 + + lxsspx vs20, o0, T1 // load b2_r + lxsspx vs21, o4, T1 // load b2_i + + addi T1, T1,8 + + lxsspx vs22, o0, T1 // load b3_r + lxsspx vs23, o4, T1 // load b3_i + + addi BO, BO, 32 + + + 
xsmuldp vs32, vs0, vs8 // a0_r*b0_r + xsmuldp vs33, vs1, vs9 // a0_i*b0_i + xsmuldp vs34, vs0, vs9 // a0_r*b0_i + xsmuldp vs35, vs1, vs8 // a0_i*b0_r + + xsmuldp vs36, vs0, vs10 // a0_r*b1_r + xsmuldp vs37, vs1, vs11 // a0_i*b1_i + xsmuldp vs38, vs0, vs11 // a0_r*b1_i + xsmuldp vs39, vs1, vs10 // a0_i*b1_r + + xsmuldp vs40, vs0, vs12 // a0_r*b2_r + xsmuldp vs41, vs1, vs13 // a0_i*b2_i + xsmuldp vs42, vs0, vs13 // a0_r*b2_i + xsmuldp vs43, vs1, vs12 // a0_i*b2_r + + xsmuldp vs44, vs0, vs14 // a0_r*b3_r + xsmuldp vs45, vs1, vs15 // a0_i*b3_i + xsmuldp vs46, vs0, vs15 // a0_r*b3_i + xsmuldp vs47, vs1, vs14 // a0_i*b3_r + + +.endm + +.macro KERNEL4x1_1 + + + lxsspx vs4, o0, AO // load a0_r + lxsspx vs5, o4, AO // load a0_i + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs16, o0, T1 // load b0_r + lxsspx vs17, o4, T1 // load b0_i + + addi T1, T1,8 + + lxsspx vs18, o0, T1 // load b1_r + lxsspx vs19, o4, T1 // load b1_i + + addi T1, T1,8 + + lxsspx vs20, o0, T1 // load b2_r + lxsspx vs21, o4, T1 // load b2_i + + addi T1, T1,8 + + lxsspx vs22, o0, T1 // load b3_r + lxsspx vs23, o4, T1 // load b3_i + + addi BO, BO, 32 + + + xsmaddadp vs32, vs0, vs8 // a0_r*b0_r + xsmaddadp vs33, vs1, vs9 // a0_i*b0_i + xsmaddadp vs34, vs0, vs9 // a0_r*b0_i + xsmaddadp vs35, vs1, vs8 // a0_i*b0_r + + xsmaddadp vs36, vs0, vs10 // a0_r*b1_r + xsmaddadp vs37, vs1, vs11 // a0_i*b1_i + xsmaddadp vs38, vs0, vs11 // a0_r*b1_i + xsmaddadp vs39, vs1, vs10 // a0_i*b1_r + + xsmaddadp vs40, vs0, vs12 // a0_r*b2_r + xsmaddadp vs41, vs1, vs13 // a0_i*b2_i + xsmaddadp vs42, vs0, vs13 // a0_r*b2_i + xsmaddadp vs43, vs1, vs12 // a0_i*b2_r + + xsmaddadp vs44, vs0, vs14 // a0_r*b3_r + xsmaddadp vs45, vs1, vs15 // a0_i*b3_i + xsmaddadp vs46, vs0, vs15 // a0_r*b3_i + xsmaddadp vs47, vs1, vs14 // a0_i*b3_r + + +.endm + +.macro KERNEL4x1_2 + + + lxsspx vs0, o0, AO // load a0_r + lxsspx vs1, o4, AO // load a0_i + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 // load b0_r + lxsspx vs9, o4, T1 // load b0_i + + 
addi T1, T1,8 + + lxsspx vs10, o0, T1 // load b1_r + lxsspx vs11, o4, T1 // load b1_i + + addi T1, T1,8 + + lxsspx vs12, o0, T1 // load b2_r + lxsspx vs13, o4, T1 // load b2_i + + addi T1, T1,8 + + lxsspx vs14, o0, T1 // load b3_r + lxsspx vs15, o4, T1 // load b3_i + + addi BO, BO, 32 + + + xsmaddadp vs32, vs4, vs16 // a4_r*b0_r + xsmaddadp vs33, vs5, vs17 // a4_i*b0_i + xsmaddadp vs34, vs4, vs17 // a4_r*b0_i + xsmaddadp vs35, vs5, vs16 // a4_i*b0_r + + xsmaddadp vs36, vs4, vs18 // a4_r*b1_r + xsmaddadp vs37, vs5, vs19 // a4_i*b1_i + xsmaddadp vs38, vs4, vs19 // a4_r*b1_i + xsmaddadp vs39, vs5, vs18 // a4_i*b1_r + + xsmaddadp vs40, vs4, vs20 // a4_r*b2_r + xsmaddadp vs41, vs5, vs21 // a4_i*b2_i + xsmaddadp vs42, vs4, vs21 // a4_r*b2_i + xsmaddadp vs43, vs5, vs20 // a4_i*b2_r + + xsmaddadp vs44, vs4, vs22 // a4_r*b3_r + xsmaddadp vs45, vs5, vs23 // a4_i*b3_i + xsmaddadp vs46, vs4, vs23 // a4_r*b3_i + xsmaddadp vs47, vs5, vs22 // a4_i*b3_r + + +.endm + +.macro KERNEL4x1_E2 + + + xsmaddadp vs32, vs4, vs16 // a4_r*b0_r + xsmaddadp vs33, vs5, vs17 // a4_i*b0_i + xsmaddadp vs34, vs4, vs17 // a4_r*b0_i + xsmaddadp vs35, vs5, vs16 // a4_i*b0_r + + xsmaddadp vs36, vs4, vs18 // a4_r*b1_r + xsmaddadp vs37, vs5, vs19 // a4_i*b1_i + xsmaddadp vs38, vs4, vs19 // a4_r*b1_i + xsmaddadp vs39, vs5, vs18 // a4_i*b1_r + + xsmaddadp vs40, vs4, vs20 // a4_r*b2_r + xsmaddadp vs41, vs5, vs21 // a4_i*b2_i + xsmaddadp vs42, vs4, vs21 // a4_r*b2_i + xsmaddadp vs43, vs5, vs20 // a4_i*b2_r + + xsmaddadp vs44, vs4, vs22 // a4_r*b3_r + xsmaddadp vs45, vs5, vs23 // a4_i*b3_i + xsmaddadp vs46, vs4, vs23 // a4_r*b3_i + xsmaddadp vs47, vs5, vs22 // a4_i*b3_r + + +.endm + +.macro KERNEL4x1_SUBI1 + + + lxsspx vs0, o0, AO // load a0_r + lxsspx vs1, o4, AO // load a0_i + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 // load b0_r + lxsspx vs9, o4, T1 // load b0_i + + addi T1, T1,8 + + lxsspx vs10, o0, T1 // load b1_r + lxsspx vs11, o4, T1 // load b1_i + + addi T1, T1,8 + + lxsspx vs12, o0, T1 // 
load b2_r + lxsspx vs13, o4, T1 // load b2_i + + addi T1, T1,8 + + lxsspx vs14, o0, T1 // load b3_r + lxsspx vs15, o4, T1 // load b3_i + + addi BO, BO, 32 + + + xsmuldp vs32, vs0, vs8 // a0_r*b0_r + xsmuldp vs33, vs1, vs9 // a0_i*b0_i + xsmuldp vs34, vs0, vs9 // a0_r*b0_i + xsmuldp vs35, vs1, vs8 // a0_i*b0_r + + xsmuldp vs36, vs0, vs10 // a0_r*b1_r + xsmuldp vs37, vs1, vs11 // a0_i*b1_i + xsmuldp vs38, vs0, vs11 // a0_r*b1_i + xsmuldp vs39, vs1, vs10 // a0_i*b1_r + + xsmuldp vs40, vs0, vs12 // a0_r*b2_r + xsmuldp vs41, vs1, vs13 // a0_i*b2_i + xsmuldp vs42, vs0, vs13 // a0_r*b2_i + xsmuldp vs43, vs1, vs12 // a0_i*b2_r + + xsmuldp vs44, vs0, vs14 // a0_r*b3_r + xsmuldp vs45, vs1, vs15 // a0_i*b3_i + xsmuldp vs46, vs0, vs15 // a0_r*b3_i + xsmuldp vs47, vs1, vs14 // a0_i*b3_r + + +.endm + +.macro KERNEL4x1_SUB1 + + + lxsspx vs0, o0, AO // load a0_r + lxsspx vs1, o4, AO // load a0_i + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 // load b0_r + lxsspx vs9, o4, T1 // load b0_i + + addi T1, T1,8 + + lxsspx vs10, o0, T1 // load b1_r + lxsspx vs11, o4, T1 // load b1_i + + addi T1, T1,8 + + lxsspx vs12, o0, T1 // load b2_r + lxsspx vs13, o4, T1 // load b2_i + + addi T1, T1,8 + + lxsspx vs14, o0, T1 // load b3_r + lxsspx vs15, o4, T1 // load b3_i + + addi BO, BO, 32 + + + xsmaddadp vs32, vs0, vs8 // a0_r*b0_r + xsmaddadp vs33, vs1, vs9 // a0_i*b0_i + xsmaddadp vs34, vs0, vs9 // a0_r*b0_i + xsmaddadp vs35, vs1, vs8 // a0_i*b0_r + + xsmaddadp vs36, vs0, vs10 // a0_r*b1_r + xsmaddadp vs37, vs1, vs11 // a0_i*b1_i + xsmaddadp vs38, vs0, vs11 // a0_r*b1_i + xsmaddadp vs39, vs1, vs10 // a0_i*b1_r + + xsmaddadp vs40, vs0, vs12 // a0_r*b2_r + xsmaddadp vs41, vs1, vs13 // a0_i*b2_i + xsmaddadp vs42, vs0, vs13 // a0_r*b2_i + xsmaddadp vs43, vs1, vs12 // a0_i*b2_r + + xsmaddadp vs44, vs0, vs14 // a0_r*b3_r + xsmaddadp vs45, vs1, vs15 // a0_i*b3_i + xsmaddadp vs46, vs0, vs15 // a0_r*b3_i + xsmaddadp vs47, vs1, vs14 // a0_i*b3_r + + +.endm + +.macro SAVE4x1 + + mr T1, CO + +// 
N=0 + + mr T2, T1 + +// N=0 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + +#ifndef TRMMKERNEL + lxsspx vs0, o0, T2 // load c0_r + lxsspx vs1, o4, T2 // load c0_i +#else + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 +#endif + + XSFADD_R1 vs4, vs4, vs32 // add a0_r * b0_r + XSFADD_I1 vs5, vs5, vs35 // add a0_r * b0_i + + XSFADD_R2 vs4, vs4, vs33 // add a0_i * b0_i + XSFADD_I2 vs5, vs5, vs34 // add a0_i * b0_r + + xsmuldp vs16, vs4, alpha_dr // r0_r * alpha_r + xsmuldp vs17, vs5, alpha_di // r0_i * alpha_i + xsmuldp vs18, vs4, alpha_di // r0_r * alpha_i + xsmuldp vs19, vs5, alpha_dr // r0_i * alpha_r + + xssubdp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsadddp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsadddp vs0, vs0, vs20 + xsadddp vs1, vs1, vs21 + + + stxsspx vs0, o0, T2 // store c0_r + stxsspx vs1, o4, T2 // store c0_i + + addi T2, T2, 8 + add T1, T1, LDC + + +// N=1 + + mr T2, T1 + +// N=1 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + +#ifndef TRMMKERNEL + lxsspx vs0, o0, T2 // load c0_r + lxsspx vs1, o4, T2 // load c0_i +#else + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 +#endif + + XSFADD_R1 vs4, vs4, vs36 // add a0_r * b0_r + XSFADD_I1 vs5, vs5, vs39 // add a0_r * b0_i + + XSFADD_R2 vs4, vs4, vs37 // add a0_i * b0_i + XSFADD_I2 vs5, vs5, vs38 // add a0_i * b0_r + + xsmuldp vs16, vs4, alpha_dr // r0_r * alpha_r + xsmuldp vs17, vs5, alpha_di // r0_i * alpha_i + xsmuldp vs18, vs4, alpha_di // r0_r * alpha_i + xsmuldp vs19, vs5, alpha_dr // r0_i * alpha_r + + xssubdp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsadddp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsadddp vs0, vs0, vs20 + xsadddp vs1, vs1, vs21 + + + stxsspx vs0, o0, T2 // store c0_r + stxsspx vs1, o4, T2 // store c0_i + + addi T2, T2, 8 + add T1, T1, LDC + + +// N=2 + + mr T2, T1 + +// N=2 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + +#ifndef TRMMKERNEL + lxsspx vs0, o0, T2 // load c0_r + lxsspx vs1, o4, T2 // load c0_i 
+#else + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 +#endif + + XSFADD_R1 vs4, vs4, vs40 // add a0_r * b0_r + XSFADD_I1 vs5, vs5, vs43 // add a0_r * b0_i + + XSFADD_R2 vs4, vs4, vs41 // add a0_i * b0_i + XSFADD_I2 vs5, vs5, vs42 // add a0_i * b0_r + + xsmuldp vs16, vs4, alpha_dr // r0_r * alpha_r + xsmuldp vs17, vs5, alpha_di // r0_i * alpha_i + xsmuldp vs18, vs4, alpha_di // r0_r * alpha_i + xsmuldp vs19, vs5, alpha_dr // r0_i * alpha_r + + xssubdp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsadddp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsadddp vs0, vs0, vs20 + xsadddp vs1, vs1, vs21 + + + stxsspx vs0, o0, T2 // store c0_r + stxsspx vs1, o4, T2 // store c0_i + + addi T2, T2, 8 + add T1, T1, LDC + + +// N=3 + + mr T2, T1 + +// N=3 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + +#ifndef TRMMKERNEL + lxsspx vs0, o0, T2 // load c0_r + lxsspx vs1, o4, T2 // load c0_i +#else + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 +#endif + + XSFADD_R1 vs4, vs4, vs44 // add a0_r * b0_r + XSFADD_I1 vs5, vs5, vs47 // add a0_r * b0_i + + XSFADD_R2 vs4, vs4, vs45 // add a0_i * b0_i + XSFADD_I2 vs5, vs5, vs46 // add a0_i * b0_r + + xsmuldp vs16, vs4, alpha_dr // r0_r * alpha_r + xsmuldp vs17, vs5, alpha_di // r0_i * alpha_i + xsmuldp vs18, vs4, alpha_di // r0_r * alpha_i + xsmuldp vs19, vs5, alpha_dr // r0_i * alpha_r + + xssubdp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsadddp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsadddp vs0, vs0, vs20 + xsadddp vs1, vs1, vs21 + + + stxsspx vs0, o0, T2 // store c0_r + stxsspx vs1, o4, T2 // store c0_i + + addi T2, T2, 8 + add T1, T1, LDC + + addi CO, CO, 8 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=8 +**********************************************************************************************/ + +.macro LOAD2x8_1 + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + 
lxvw4x vs2, o32, AO // load a4, a5 + + lxvw4x vs3, o48, AO // load a6, a7 + + + addi AO, AO, 64 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 16 + +.endm + +.macro KERNEL2x8_I1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + lxvw4x vs5, o16, AO // load a2, a3 + + lxvw4x vs6, o32, AO // load a4, a5 + + lxvw4x vs7, o48, AO // load a6, a7 + + + addi AO, AO, 64 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs16, vs24, 0 + xxspltw vs17, vs24, 1 + xxspltw vs18, vs24, 2 + xxspltw vs19, vs24, 3 + + + addi BO, BO, 16 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmulsp vs40, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs41, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs42, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs43, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs44, vs2, vs10 // a2_r*b1_r, a2_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs45, vs2, vs11 // a2_r*b1_i, a2_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs46, vs3, vs10 // a3_r*b1_r, a3_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs47, vs3, vs11 // a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x8_1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + lxvw4x vs5, o16, AO // load a2, a3 + + lxvw4x vs6, o32, AO // load a4, a5 + + lxvw4x vs7, o48, AO // load a6, a7 + + + addi AO, AO, 64 + + lxvw4x 
vs24, o0, BO // load b0, b1 + + xxspltw vs16, vs24, 0 + xxspltw vs17, vs24, 1 + xxspltw vs18, vs24, 2 + xxspltw vs19, vs24, 3 + + + addi BO, BO, 16 + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs40, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs41, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs42, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs43, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs44, vs2, vs10 // a2_r*b1_r, a2_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs45, vs2, vs11 // a2_r*b1_i, a2_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs46, vs3, vs10 // a3_r*b1_r, a3_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs47, vs3, vs11 // a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x8_2 + + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + lxvw4x vs2, o32, AO // load a4, a5 + + lxvw4x vs3, o48, AO // load a6, a7 + + + addi AO, AO, 64 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 16 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i 
+ xvmaddasp vs36, vs6, vs16 // a6_r*b0_r, a6_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs37, vs6, vs17 // a6_r*b0_i, a6_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs38, vs7, vs16 // a7_r*b0_r, a7_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs39, vs7, vs17 // a7_r*b0_i, a7_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs40, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs41, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs42, vs5, vs18 // a5_r*b1_r, a5_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs43, vs5, vs19 // a5_r*b1_i, a5_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs44, vs6, vs18 // a6_r*b1_r, a6_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs45, vs6, vs19 // a6_r*b1_i, a6_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs46, vs7, vs18 // a7_r*b1_r, a7_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs47, vs7, vs19 // a7_r*b1_i, a7_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x8_E2 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs36, vs6, vs16 // a6_r*b0_r, a6_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs37, vs6, vs17 // a6_r*b0_i, a6_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs38, vs7, vs16 // a7_r*b0_r, a7_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs39, vs7, vs17 // a7_r*b0_i, a7_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs40, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs41, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs42, vs5, vs18 // a5_r*b1_r, a5_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs43, vs5, vs19 // a5_r*b1_i, a5_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs44, vs6, vs18 // a6_r*b1_r, a6_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs45, vs6, vs19 // a6_r*b1_i, a6_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs46, vs7, vs18 // 
a7_r*b1_r, a7_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs47, vs7, vs19 // a7_r*b1_i, a7_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x8_SUBI1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + lxvw4x vs2, o32, AO // load a4, a5 + + lxvw4x vs3, o48, AO // load a6, a7 + + + addi AO, AO, 64 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 16 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmulsp vs40, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs41, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs42, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs43, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs44, vs2, vs10 // a2_r*b1_r, a2_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs45, vs2, vs11 // a2_r*b1_i, a2_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs46, vs3, vs10 // a3_r*b1_r, a3_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs47, vs3, vs11 // a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x8_SUB1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + lxvw4x vs2, o32, AO // load a4, a5 + + lxvw4x vs3, o48, AO // load a6, a7 + + + addi AO, AO, 64 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + 
addi BO, BO, 16 + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs40, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs41, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs42, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs43, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs44, vs2, vs10 // a2_r*b1_r, a2_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs45, vs2, vs11 // a2_r*b1_i, a2_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs46, vs3, vs10 // a3_r*b1_r, a3_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs47, vs3, vs11 // a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro SAVE2x8 + + mr T1, CO + +// N=0 + + mr T2, T1 + +// N=0 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs32, 0 + xxspltw vs9, vs32, 1 + xxspltw vs10, vs32, 2 + xxspltw vs11, vs32, 3 + + + xxspltw vs12, vs33, 0 + xxspltw vs13, vs33, 1 + xxspltw vs14, vs33, 2 + xxspltw vs15, vs33, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 
vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=0 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs34, 0 + xxspltw vs9, vs34, 1 + xxspltw vs10, vs34, 2 + xxspltw vs11, vs34, 3 + + + xxspltw vs12, vs35, 0 + xxspltw vs13, vs35, 1 + xxspltw vs14, vs35, 2 + xxspltw vs15, vs35, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, 
vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=0 M=4 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs36, 0 + xxspltw vs9, vs36, 1 + xxspltw vs10, vs36, 2 + xxspltw vs11, vs36, 3 + + + xxspltw vs12, vs37, 0 + xxspltw vs13, vs37, 1 + xxspltw vs14, vs37, 2 + xxspltw vs15, vs37, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r 
* alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=0 M=6 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs38, 0 + xxspltw vs9, vs38, 1 + xxspltw vs10, vs38, 2 + xxspltw vs11, vs38, 3 + + + xxspltw vs12, vs39, 0 + xxspltw vs13, vs39, 1 + xxspltw vs14, vs39, 2 + xxspltw vs15, vs39, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * 
alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + +// N=1 + + mr T2, T1 + +// N=1 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs40, 0 + xxspltw vs9, vs40, 1 + xxspltw vs10, vs40, 2 + xxspltw vs11, vs40, 3 + + + xxspltw vs12, vs41, 0 + xxspltw vs13, vs41, 1 + xxspltw vs14, vs41, 2 + xxspltw vs15, vs41, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + 
xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=1 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs42, 0 + xxspltw vs9, vs42, 1 + xxspltw vs10, vs42, 2 + xxspltw vs11, vs42, 3 + + + xxspltw vs12, vs43, 0 + xxspltw vs13, vs43, 1 + xxspltw vs14, vs43, 2 + xxspltw vs15, vs43, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + 
xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=1 M=4 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs44, 0 + xxspltw vs9, vs44, 1 + xxspltw vs10, vs44, 2 + xxspltw vs11, vs44, 3 + + + xxspltw vs12, vs45, 0 + xxspltw vs13, vs45, 1 + xxspltw vs14, vs45, 2 + xxspltw vs15, vs45, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=1 M=6 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + 
xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs46, 0 + xxspltw vs9, vs46, 1 + xxspltw vs10, vs46, 2 + xxspltw vs11, vs46, 3 + + + xxspltw vs12, vs47, 0 + xxspltw vs13, vs47, 1 + xxspltw vs14, vs47, 2 + xxspltw vs15, vs47, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + addi CO, CO, 64 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=4 
+**********************************************************************************************/ + +.macro LOAD2x4_1 + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + + addi AO, AO, 32 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 16 + +.endm + +.macro KERNEL2x4_I1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + lxvw4x vs5, o16, AO // load a2, a3 + + + addi AO, AO, 32 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs16, vs24, 0 + xxspltw vs17, vs24, 1 + xxspltw vs18, vs24, 2 + xxspltw vs19, vs24, 3 + + + addi BO, BO, 16 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmulsp vs36, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs37, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs38, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x4_1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + lxvw4x vs5, o16, AO // load a2, a3 + + + addi AO, AO, 32 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs16, vs24, 0 + xxspltw vs17, vs24, 1 + xxspltw vs18, vs24, 2 + xxspltw vs19, vs24, 3 + + + addi BO, BO, 16 + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs36, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs37, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs38, 
vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x4_2 + + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + + addi AO, AO, 32 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 16 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs36, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs37, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs38, vs5, vs18 // a5_r*b1_r, a5_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs39, vs5, vs19 // a5_r*b1_i, a5_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x4_E2 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs36, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs37, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs38, vs5, vs18 // a5_r*b1_r, a5_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs39, vs5, vs19 // a5_r*b1_i, a5_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x4_SUBI1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + + addi AO, AO, 32 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 16 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, 
a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmulsp vs36, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs37, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmulsp vs38, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x4_SUB1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + + addi AO, AO, 32 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 16 + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs36, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs37, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + xvmaddasp vs38, vs1, vs10 // a1_r*b1_r, a1_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro SAVE2x4 + + mr T1, CO + +// N=0 + + mr T2, T1 + +// N=0 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs32, 0 + xxspltw vs9, vs32, 1 + xxspltw vs10, vs32, 2 + xxspltw vs11, vs32, 3 + + + xxspltw vs12, vs33, 0 + xxspltw vs13, vs33, 1 + xxspltw vs14, vs33, 2 + xxspltw vs15, vs33, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // 
add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=0 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs34, 0 + xxspltw vs9, vs34, 1 + xxspltw vs10, vs34, 2 + xxspltw vs11, vs34, 3 + + + xxspltw vs12, vs35, 0 + xxspltw vs13, vs35, 1 + xxspltw vs14, vs35, 2 + xxspltw vs15, vs35, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * 
b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + +// N=1 + + mr T2, T1 + +// N=1 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs36, 0 + xxspltw vs9, vs36, 1 + xxspltw vs10, vs36, 2 + xxspltw vs11, vs36, 3 + + + xxspltw vs12, vs37, 0 + xxspltw vs13, vs37, 1 + xxspltw vs14, vs37, 2 + xxspltw vs15, vs37, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + 
xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=1 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs38, 0 + xxspltw vs9, vs38, 1 + xxspltw vs10, vs38, 2 + xxspltw vs11, vs38, 3 + + + xxspltw vs12, vs39, 0 + xxspltw vs13, vs39, 1 + xxspltw vs14, vs39, 2 + xxspltw vs15, vs39, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * 
alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + addi CO, CO, 32 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=2 +**********************************************************************************************/ + +.macro LOAD2x2_1 + + lxvw4x vs0, o0, AO // load a0, a1 + + + addi AO, AO, 16 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 16 + +.endm + +.macro KERNEL2x2_I1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + + addi AO, AO, 16 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs16, vs24, 0 + xxspltw vs17, vs24, 1 + xxspltw vs18, vs24, 2 + xxspltw vs19, vs24, 3 + + + addi BO, BO, 16 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmulsp vs34, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x2_1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + + addi AO, AO, 16 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs16, vs24, 0 + xxspltw vs17, vs24, 1 + xxspltw vs18, vs24, 2 
+ xxspltw vs19, vs24, 3 + + + addi BO, BO, 16 + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs34, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x2_2 + + + lxvw4x vs0, o0, AO // load a0, a1 + + + addi AO, AO, 16 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 16 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs34, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs35, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x2_E2 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs34, vs4, vs18 // a4_r*b1_r, a4_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs35, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x2_SUBI1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + + addi AO, AO, 16 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 16 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmulsp vs34, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmulsp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro KERNEL2x2_SUB1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + + addi AO, AO, 16 + + lxvw4x vs24, o0, BO // load b0, b1 + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + 
xxspltw vs11, vs24, 3 + + + addi BO, BO, 16 + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + + xvmaddasp vs34, vs0, vs10 // a0_r*b1_r, a0_i*b1_r, a1_r*b1_r, a1_i*b1_r + xvmaddasp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i + + +.endm + +.macro SAVE2x2 + + mr T1, CO + +// N=0 + + mr T2, T1 + +// N=0 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs32, 0 + xxspltw vs9, vs32, 1 + xxspltw vs10, vs32, 2 + xxspltw vs11, vs32, 3 + + + xxspltw vs12, vs33, 0 + xxspltw vs13, vs33, 1 + xxspltw vs14, vs33, 2 + xxspltw vs15, vs33, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + 
xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + +// N=1 + + mr T2, T1 + +// N=1 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs34, 0 + xxspltw vs9, vs34, 1 + xxspltw vs10, vs34, 2 + xxspltw vs11, vs34, 3 + + + xxspltw vs12, vs35, 0 + xxspltw vs13, vs35, 1 + xxspltw vs14, vs35, 2 + xxspltw vs15, vs35, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, 
vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + addi CO, CO, 16 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=1 +**********************************************************************************************/ + +.macro LOAD2x1_1 + + lxsspx vs0, o0, AO // load a0_r + lxsspx vs1, o4, AO // load a0_i + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 // load b0_r + lxsspx vs9, o4, T1 // load b0_i + + addi T1, T1,8 + + lxsspx vs10, o0, T1 // load b1_r + lxsspx vs11, o4, T1 // load b1_i + + addi BO, BO, 16 + +.endm + +.macro KERNEL2x1_I1 + + + lxsspx vs4, o0, AO // load a0_r + lxsspx vs5, o4, AO // load a0_i + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs16, o0, T1 // load b0_r + lxsspx vs17, o4, T1 // load b0_i + + addi T1, T1,8 + + lxsspx vs18, o0, T1 // load b1_r + lxsspx vs19, o4, T1 // load b1_i + + addi BO, BO, 16 + + + xsmuldp vs32, vs0, vs8 // a0_r*b0_r + xsmuldp vs33, vs1, vs9 // a0_i*b0_i + xsmuldp vs34, vs0, vs9 // a0_r*b0_i + xsmuldp vs35, vs1, vs8 // a0_i*b0_r + + xsmuldp vs36, vs0, vs10 // a0_r*b1_r + xsmuldp vs37, vs1, vs11 // a0_i*b1_i + xsmuldp vs38, vs0, vs11 // a0_r*b1_i + xsmuldp vs39, vs1, vs10 // a0_i*b1_r + + +.endm + +.macro KERNEL2x1_1 + + + lxsspx vs4, o0, AO // load a0_r + lxsspx vs5, o4, AO // load a0_i + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs16, o0, T1 // load b0_r + lxsspx vs17, o4, T1 // load b0_i + + addi T1, T1,8 + + lxsspx vs18, o0, T1 // load b1_r + lxsspx vs19, o4, T1 // load b1_i + + addi BO, BO, 16 + + + xsmaddadp vs32, vs0, vs8 // a0_r*b0_r + xsmaddadp vs33, vs1, vs9 // a0_i*b0_i + xsmaddadp vs34, vs0, vs9 // a0_r*b0_i + xsmaddadp vs35, vs1, vs8 // a0_i*b0_r + + xsmaddadp vs36, vs0, vs10 // a0_r*b1_r + xsmaddadp vs37, vs1, vs11 // a0_i*b1_i + xsmaddadp vs38, vs0, vs11 // a0_r*b1_i + xsmaddadp vs39, vs1, vs10 // a0_i*b1_r + + +.endm + +.macro KERNEL2x1_2 + + + lxsspx vs0, o0, AO // load a0_r + lxsspx 
vs1, o4, AO // load a0_i + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 // load b0_r + lxsspx vs9, o4, T1 // load b0_i + + addi T1, T1,8 + + lxsspx vs10, o0, T1 // load b1_r + lxsspx vs11, o4, T1 // load b1_i + + addi BO, BO, 16 + + + xsmaddadp vs32, vs4, vs16 // a4_r*b0_r + xsmaddadp vs33, vs5, vs17 // a4_i*b0_i + xsmaddadp vs34, vs4, vs17 // a4_r*b0_i + xsmaddadp vs35, vs5, vs16 // a4_i*b0_r + + xsmaddadp vs36, vs4, vs18 // a4_r*b1_r + xsmaddadp vs37, vs5, vs19 // a4_i*b1_i + xsmaddadp vs38, vs4, vs19 // a4_r*b1_i + xsmaddadp vs39, vs5, vs18 // a4_i*b1_r + + +.endm + +.macro KERNEL2x1_E2 + + + xsmaddadp vs32, vs4, vs16 // a4_r*b0_r + xsmaddadp vs33, vs5, vs17 // a4_i*b0_i + xsmaddadp vs34, vs4, vs17 // a4_r*b0_i + xsmaddadp vs35, vs5, vs16 // a4_i*b0_r + + xsmaddadp vs36, vs4, vs18 // a4_r*b1_r + xsmaddadp vs37, vs5, vs19 // a4_i*b1_i + xsmaddadp vs38, vs4, vs19 // a4_r*b1_i + xsmaddadp vs39, vs5, vs18 // a4_i*b1_r + + +.endm + +.macro KERNEL2x1_SUBI1 + + + lxsspx vs0, o0, AO // load a0_r + lxsspx vs1, o4, AO // load a0_i + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 // load b0_r + lxsspx vs9, o4, T1 // load b0_i + + addi T1, T1,8 + + lxsspx vs10, o0, T1 // load b1_r + lxsspx vs11, o4, T1 // load b1_i + + addi BO, BO, 16 + + + xsmuldp vs32, vs0, vs8 // a0_r*b0_r + xsmuldp vs33, vs1, vs9 // a0_i*b0_i + xsmuldp vs34, vs0, vs9 // a0_r*b0_i + xsmuldp vs35, vs1, vs8 // a0_i*b0_r + + xsmuldp vs36, vs0, vs10 // a0_r*b1_r + xsmuldp vs37, vs1, vs11 // a0_i*b1_i + xsmuldp vs38, vs0, vs11 // a0_r*b1_i + xsmuldp vs39, vs1, vs10 // a0_i*b1_r + + +.endm + +.macro KERNEL2x1_SUB1 + + + lxsspx vs0, o0, AO // load a0_r + lxsspx vs1, o4, AO // load a0_i + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 // load b0_r + lxsspx vs9, o4, T1 // load b0_i + + addi T1, T1,8 + + lxsspx vs10, o0, T1 // load b1_r + lxsspx vs11, o4, T1 // load b1_i + + addi BO, BO, 16 + + + xsmaddadp vs32, vs0, vs8 // a0_r*b0_r + xsmaddadp vs33, vs1, vs9 // a0_i*b0_i + xsmaddadp vs34, 
vs0, vs9 // a0_r*b0_i + xsmaddadp vs35, vs1, vs8 // a0_i*b0_r + + xsmaddadp vs36, vs0, vs10 // a0_r*b1_r + xsmaddadp vs37, vs1, vs11 // a0_i*b1_i + xsmaddadp vs38, vs0, vs11 // a0_r*b1_i + xsmaddadp vs39, vs1, vs10 // a0_i*b1_r + + +.endm + +.macro SAVE2x1 + + mr T1, CO + +// N=0 + + mr T2, T1 + +// N=0 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + +#ifndef TRMMKERNEL + lxsspx vs0, o0, T2 // load c0_r + lxsspx vs1, o4, T2 // load c0_i +#else + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 +#endif + + XSFADD_R1 vs4, vs4, vs32 // add a0_r * b0_r + XSFADD_I1 vs5, vs5, vs35 // add a0_r * b0_i + + XSFADD_R2 vs4, vs4, vs33 // add a0_i * b0_i + XSFADD_I2 vs5, vs5, vs34 // add a0_i * b0_r + + xsmuldp vs16, vs4, alpha_dr // r0_r * alpha_r + xsmuldp vs17, vs5, alpha_di // r0_i * alpha_i + xsmuldp vs18, vs4, alpha_di // r0_r * alpha_i + xsmuldp vs19, vs5, alpha_dr // r0_i * alpha_r + + xssubdp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsadddp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsadddp vs0, vs0, vs20 + xsadddp vs1, vs1, vs21 + + + stxsspx vs0, o0, T2 // store c0_r + stxsspx vs1, o4, T2 // store c0_i + + addi T2, T2, 8 + add T1, T1, LDC + + +// N=1 + + mr T2, T1 + +// N=1 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + +#ifndef TRMMKERNEL + lxsspx vs0, o0, T2 // load c0_r + lxsspx vs1, o4, T2 // load c0_i +#else + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 +#endif + + XSFADD_R1 vs4, vs4, vs36 // add a0_r * b0_r + XSFADD_I1 vs5, vs5, vs39 // add a0_r * b0_i + + XSFADD_R2 vs4, vs4, vs37 // add a0_i * b0_i + XSFADD_I2 vs5, vs5, vs38 // add a0_i * b0_r + + xsmuldp vs16, vs4, alpha_dr // r0_r * alpha_r + xsmuldp vs17, vs5, alpha_di // r0_i * alpha_i + xsmuldp vs18, vs4, alpha_di // r0_r * alpha_i + xsmuldp vs19, vs5, alpha_dr // r0_i * alpha_r + + xssubdp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsadddp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsadddp vs0, vs0, vs20 + xsadddp vs1, vs1, vs21 + + + stxsspx vs0, 
o0, T2 // store c0_r + stxsspx vs1, o4, T2 // store c0_i + + addi T2, T2, 8 + add T1, T1, LDC + + addi CO, CO, 8 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=8 +**********************************************************************************************/ + +.macro LOAD1x8_1 + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + lxvw4x vs2, o32, AO // load a4, a5 + + lxvw4x vs3, o48, AO // load a6, a7 + + + addi AO, AO, 64 + + lxvw4x vs24, o0, BO // load b0, b1 + + + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 8 + +.endm + +.macro KERNEL1x8_I1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + lxvw4x vs5, o16, AO // load a2, a3 + + lxvw4x vs6, o32, AO // load a4, a5 + + lxvw4x vs7, o48, AO // load a6, a7 + + + addi AO, AO, 64 + + lxvw4x vs24, o0, BO // load b0, b1 + + + + xxspltw vs16, vs24, 0 + xxspltw vs17, vs24, 1 + xxspltw vs18, vs24, 2 + xxspltw vs19, vs24, 3 + + + addi BO, BO, 8 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x8_1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + lxvw4x vs5, o16, AO // load a2, a3 + + lxvw4x vs6, o32, AO // load a4, a5 + + lxvw4x vs7, o48, AO // load a6, a7 + + + addi AO, AO, 64 + + lxvw4x vs24, o0, BO // load b0, b1 + + + + xxspltw vs16, vs24, 0 + xxspltw vs17, vs24, 1 + xxspltw vs18, vs24, 2 + xxspltw vs19, vs24, 
3 + + + addi BO, BO, 8 + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x8_2 + + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + lxvw4x vs2, o32, AO // load a4, a5 + + lxvw4x vs3, o48, AO // load a6, a7 + + + addi AO, AO, 64 + + lxvw4x vs24, o0, BO // load b0, b1 + + + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 8 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs36, vs6, vs16 // a6_r*b0_r, a6_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs37, vs6, vs17 // a6_r*b0_i, a6_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs38, vs7, vs16 // a7_r*b0_r, a7_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs39, vs7, vs17 // a7_r*b0_i, a7_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x8_E2 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs36, vs6, vs16 // a6_r*b0_r, a6_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp 
vs37, vs6, vs17 // a6_r*b0_i, a6_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs38, vs7, vs16 // a7_r*b0_r, a7_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs39, vs7, vs17 // a7_r*b0_i, a7_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x8_SUBI1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + lxvw4x vs2, o32, AO // load a4, a5 + + lxvw4x vs3, o48, AO // load a6, a7 + + + addi AO, AO, 64 + + lxvw4x vs24, o0, BO // load b0, b1 + + + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 8 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs38, vs3, vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x8_SUB1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + lxvw4x vs2, o32, AO // load a4, a5 + + lxvw4x vs3, o48, AO // load a6, a7 + + + addi AO, AO, 64 + + lxvw4x vs24, o0, BO // load b0, b1 + + + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 8 + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs36, vs2, vs8 // a2_r*b0_r, a2_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs37, vs2, vs9 // a2_r*b0_i, a2_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs38, vs3, 
vs8 // a3_r*b0_r, a3_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro SAVE1x8 + + mr T1, CO + +// N=0 + + mr T2, T1 + +// N=0 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs32, 0 + xxspltw vs9, vs32, 1 + xxspltw vs10, vs32, 2 + xxspltw vs11, vs32, 3 + + + xxspltw vs12, vs33, 0 + xxspltw vs13, vs33, 1 + xxspltw vs14, vs33, 2 + xxspltw vs15, vs33, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// 
N=0 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs34, 0 + xxspltw vs9, vs34, 1 + xxspltw vs10, vs34, 2 + xxspltw vs11, vs34, 3 + + + xxspltw vs12, vs35, 0 + xxspltw vs13, vs35, 1 + xxspltw vs14, vs35, 2 + xxspltw vs15, vs35, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=0 M=4 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + 
xxspltw vs8, vs36, 0 + xxspltw vs9, vs36, 1 + xxspltw vs10, vs36, 2 + xxspltw vs11, vs36, 3 + + + xxspltw vs12, vs37, 0 + xxspltw vs13, vs37, 1 + xxspltw vs14, vs37, 2 + xxspltw vs15, vs37, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=0 M=6 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs38, 0 + xxspltw vs9, vs38, 1 + xxspltw vs10, vs38, 2 + xxspltw vs11, vs38, 3 + + + xxspltw vs12, vs39, 0 + xxspltw vs13, vs39, 1 + xxspltw vs14, vs39, 2 + xxspltw vs15, vs39, 3 + + 
XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + addi CO, CO, 64 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=4 +**********************************************************************************************/ + +.macro LOAD1x4_1 + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + + addi AO, AO, 32 + + lxvw4x vs24, o0, BO // load b0, b1 + + + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 8 + +.endm + +.macro KERNEL1x4_I1 + + + lxvw4x vs4, 
o0, AO // load a0, a1 + + lxvw4x vs5, o16, AO // load a2, a3 + + + addi AO, AO, 32 + + lxvw4x vs24, o0, BO // load b0, b1 + + + + xxspltw vs16, vs24, 0 + xxspltw vs17, vs24, 1 + xxspltw vs18, vs24, 2 + xxspltw vs19, vs24, 3 + + + addi BO, BO, 8 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x4_1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + lxvw4x vs5, o16, AO // load a2, a3 + + + addi AO, AO, 32 + + lxvw4x vs24, o0, BO // load b0, b1 + + + + xxspltw vs16, vs24, 0 + xxspltw vs17, vs24, 1 + xxspltw vs18, vs24, 2 + xxspltw vs19, vs24, 3 + + + addi BO, BO, 8 + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x4_2 + + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + + addi AO, AO, 32 + + lxvw4x vs24, o0, BO // load b0, b1 + + + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 8 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x4_E2 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs5, vs16 // a5_r*b0_r, a5_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, 
vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x4_SUBI1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + + addi AO, AO, 32 + + lxvw4x vs24, o0, BO // load b0, b1 + + + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 8 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmulsp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x4_SUB1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + lxvw4x vs1, o16, AO // load a2, a3 + + + addi AO, AO, 32 + + lxvw4x vs24, o0, BO // load b0, b1 + + + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 8 + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + xvmaddasp vs34, vs1, vs8 // a1_r*b0_r, a1_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro SAVE1x4 + + mr T1, CO + +// N=0 + + mr T2, T1 + +// N=0 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs32, 0 + xxspltw vs9, vs32, 1 + xxspltw vs10, vs32, 2 + xxspltw vs11, vs32, 3 + + + xxspltw vs12, vs33, 0 + xxspltw vs13, vs33, 1 + xxspltw vs14, vs33, 2 + xxspltw vs15, vs33, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, 
vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + +// N=0 M=2 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs34, 0 + xxspltw vs9, vs34, 1 + xxspltw vs10, vs34, 2 + xxspltw vs11, vs34, 3 + + + xxspltw vs12, vs35, 0 + xxspltw vs13, vs35, 1 + xxspltw vs14, vs35, 2 + xxspltw vs15, vs35, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si 
// r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + addi CO, CO, 32 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=2 +**********************************************************************************************/ + +.macro LOAD1x2_1 + + lxvw4x vs0, o0, AO // load a0, a1 + + + addi AO, AO, 16 + + lxvw4x vs24, o0, BO // load b0, b1 + + + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 8 + +.endm + +.macro KERNEL1x2_I1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + + addi AO, AO, 16 + + lxvw4x vs24, o0, BO // load b0, b1 + + + + xxspltw vs16, vs24, 0 + xxspltw vs17, vs24, 1 + xxspltw vs18, vs24, 2 + xxspltw vs19, vs24, 3 + + + addi BO, BO, 8 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x2_1 + + + lxvw4x vs4, o0, AO // load a0, a1 + + + addi AO, AO, 16 + + lxvw4x vs24, o0, BO // load b0, b1 + + + + xxspltw vs16, vs24, 0 + xxspltw vs17, 
vs24, 1 + xxspltw vs18, vs24, 2 + xxspltw vs19, vs24, 3 + + + addi BO, BO, 8 + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x2_2 + + + lxvw4x vs0, o0, AO // load a0, a1 + + + addi AO, AO, 16 + + lxvw4x vs24, o0, BO // load b0, b1 + + + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 8 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x2_E2 + + + xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x2_SUBI1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + + addi AO, AO, 16 + + lxvw4x vs24, o0, BO // load b0, b1 + + + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 8 + + + xvmulsp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro KERNEL1x2_SUB1 + + + lxvw4x vs0, o0, AO // load a0, a1 + + + addi AO, AO, 16 + + lxvw4x vs24, o0, BO // load b0, b1 + + + + xxspltw vs8, vs24, 0 + xxspltw vs9, vs24, 1 + xxspltw vs10, vs24, 2 + xxspltw vs11, vs24, 3 + + + addi BO, BO, 8 + + + xvmaddasp vs32, vs0, vs8 // a0_r*b0_r, a0_i*b0_r, a1_r*b0_r, a1_i*b0_r + xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i + + +.endm + +.macro SAVE1x2 + + mr T1, CO + +// N=0 + + mr T2, T1 + +// N=0 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + xxlxor vs6, vs6, vs6 + xxlxor vs7, vs7, vs7 + +#ifndef TRMMKERNEL + lxvw4x vs0, o0, T2 // c0, c1 +#else + xxlxor vs0, vs0, vs0 +#endif + + + xxspltw vs8, vs32, 0 + xxspltw vs9, vs32, 1 + xxspltw vs10, vs32, 2 + xxspltw vs11, vs32, 3 + 
+ + xxspltw vs12, vs33, 0 + xxspltw vs13, vs33, 1 + xxspltw vs14, vs33, 2 + xxspltw vs15, vs33, 3 + + XVFADD_R1 vs4, vs4, vs8 // add a0_r * b0_r + XVFADD_I2 vs5, vs5, vs12 // add a0_r * b0_i + XVFADD_R1 vs6, vs6, vs10 // add a1_r * b0_r + XVFADD_I2 vs7, vs7, vs14 // add a1_r * b0_i + + XVFADD_R2 vs4, vs4, vs13 // add a0_i * b0_i + XVFADD_I1 vs5, vs5, vs9 // add a0_i * b0_r + XVFADD_R2 vs6, vs6, vs15 // add a1_i * b0_i + XVFADD_I1 vs7, vs7, vs11 // add a1_i * b0_r + + xvmulsp vs16, vs4, alpha_sr // r0_r * alpha_r + xvmulsp vs17, vs5, alpha_si // r0_i * alpha_i + xvmulsp vs18, vs4, alpha_si // r0_r * alpha_i + xvmulsp vs19, vs5, alpha_sr // r0_i * alpha_r + + xvsubsp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xvaddsp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xvmulsp vs16, vs6, alpha_sr // r1_r * alpha_r + xvmulsp vs17, vs7, alpha_si // r1_i * alpha_i + xvmulsp vs18, vs6, alpha_si // r1_r * alpha_i + xvmulsp vs19, vs7, alpha_sr // r1_i * alpha_r + + xvsubsp vs22, vs16, vs17 // r1_r * alpha_r - r1_i * alpha_i + xvaddsp vs23, vs18, vs19 // r1_r * alpha_i + r1_i * alpha_r + + xxlxor vs24, vs24, vs24 + xxsldwi vs20, vs20, vs24, 3 // r0_r + xxsldwi vs21, vs21, vs24, 2 // r0_i + xxsldwi vs22, vs22, vs24, 1 // r1_r + xxsldwi vs23, vs23, vs24, 0 // r1_i + xvaddsp vs20, vs20, vs21 // r0_r, r0_i + xvaddsp vs22, vs22, vs23 // r1_r, r1_i + xvaddsp vs1, vs20, vs22 // r0_r, r0_i, r1_r, r1_i + xvaddsp vs0, vs0, vs1 + + + stxvw4x vs0, o0, T2 // c0, c1 + + addi T2, T2, 16 + add T1, T1, LDC + + addi CO, CO, 16 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=1 +**********************************************************************************************/ + +.macro LOAD1x1_1 + + lxsspx vs0, o0, AO // load a0_r + lxsspx vs1, o4, AO // load a0_i + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 // load b0_r + lxsspx vs9, o4, T1 // load b0_i + + addi BO, BO, 8 + +.endm + 
+.macro KERNEL1x1_I1 + + + lxsspx vs4, o0, AO // load a0_r + lxsspx vs5, o4, AO // load a0_i + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs16, o0, T1 // load b0_r + lxsspx vs17, o4, T1 // load b0_i + + addi BO, BO, 8 + + + xsmuldp vs32, vs0, vs8 // a0_r*b0_r + xsmuldp vs33, vs1, vs9 // a0_i*b0_i + xsmuldp vs34, vs0, vs9 // a0_r*b0_i + xsmuldp vs35, vs1, vs8 // a0_i*b0_r + + +.endm + +.macro KERNEL1x1_1 + + + lxsspx vs4, o0, AO // load a0_r + lxsspx vs5, o4, AO // load a0_i + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs16, o0, T1 // load b0_r + lxsspx vs17, o4, T1 // load b0_i + + addi BO, BO, 8 + + + xsmaddadp vs32, vs0, vs8 // a0_r*b0_r + xsmaddadp vs33, vs1, vs9 // a0_i*b0_i + xsmaddadp vs34, vs0, vs9 // a0_r*b0_i + xsmaddadp vs35, vs1, vs8 // a0_i*b0_r + + +.endm + +.macro KERNEL1x1_2 + + + lxsspx vs0, o0, AO // load a0_r + lxsspx vs1, o4, AO // load a0_i + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 // load b0_r + lxsspx vs9, o4, T1 // load b0_i + + addi BO, BO, 8 + + + xsmaddadp vs32, vs4, vs16 // a4_r*b0_r + xsmaddadp vs33, vs5, vs17 // a4_i*b0_i + xsmaddadp vs34, vs4, vs17 // a4_r*b0_i + xsmaddadp vs35, vs5, vs16 // a4_i*b0_r + + +.endm + +.macro KERNEL1x1_E2 + + + xsmaddadp vs32, vs4, vs16 // a4_r*b0_r + xsmaddadp vs33, vs5, vs17 // a4_i*b0_i + xsmaddadp vs34, vs4, vs17 // a4_r*b0_i + xsmaddadp vs35, vs5, vs16 // a4_i*b0_r + + +.endm + +.macro KERNEL1x1_SUBI1 + + + lxsspx vs0, o0, AO // load a0_r + lxsspx vs1, o4, AO // load a0_i + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 // load b0_r + lxsspx vs9, o4, T1 // load b0_i + + addi BO, BO, 8 + + + xsmuldp vs32, vs0, vs8 // a0_r*b0_r + xsmuldp vs33, vs1, vs9 // a0_i*b0_i + xsmuldp vs34, vs0, vs9 // a0_r*b0_i + xsmuldp vs35, vs1, vs8 // a0_i*b0_r + + +.endm + +.macro KERNEL1x1_SUB1 + + + lxsspx vs0, o0, AO // load a0_r + lxsspx vs1, o4, AO // load a0_i + + addi AO, AO, 8 + + mr T1, BO + + lxsspx vs8, o0, T1 // load b0_r + lxsspx vs9, o4, T1 // load b0_i + + addi BO, BO, 8 + + + xsmaddadp 
vs32, vs0, vs8 // a0_r*b0_r + xsmaddadp vs33, vs1, vs9 // a0_i*b0_i + xsmaddadp vs34, vs0, vs9 // a0_r*b0_i + xsmaddadp vs35, vs1, vs8 // a0_i*b0_r + + +.endm + +.macro SAVE1x1 + + mr T1, CO + +// N=0 + + mr T2, T1 + +// N=0 M=0 + + xxlxor vs4, vs4, vs4 + xxlxor vs5, vs5, vs5 + +#ifndef TRMMKERNEL + lxsspx vs0, o0, T2 // load c0_r + lxsspx vs1, o4, T2 // load c0_i +#else + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 +#endif + + XSFADD_R1 vs4, vs4, vs32 // add a0_r * b0_r + XSFADD_I1 vs5, vs5, vs35 // add a0_r * b0_i + + XSFADD_R2 vs4, vs4, vs33 // add a0_i * b0_i + XSFADD_I2 vs5, vs5, vs34 // add a0_i * b0_r + + xsmuldp vs16, vs4, alpha_dr // r0_r * alpha_r + xsmuldp vs17, vs5, alpha_di // r0_i * alpha_i + xsmuldp vs18, vs4, alpha_di // r0_r * alpha_i + xsmuldp vs19, vs5, alpha_dr // r0_i * alpha_r + + xssubdp vs20, vs16, vs17 // r0_r * alpha_r - r0_i * alpha_i + xsadddp vs21, vs18, vs19 // r0_r * alpha_i + r0_i * alpha_r + + xsadddp vs0, vs0, vs20 + xsadddp vs1, vs1, vs21 + + + stxsspx vs0, o0, T2 // store c0_r + stxsspx vs1, o4, T2 // store c0_i + + addi T2, T2, 8 + add T1, T1, LDC + + addi CO, CO, 8 + +.endm + diff --git a/param.h b/param.h index fb344cd33..d01c992c4 100644 --- a/param.h +++ b/param.h @@ -1979,7 +1979,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define SGEMM_DEFAULT_P 960 #define DGEMM_DEFAULT_P 480 -#define CGEMM_DEFAULT_P 480 +#define CGEMM_DEFAULT_P 720 #define ZGEMM_DEFAULT_P 240 #define SGEMM_DEFAULT_Q 720 From a51102e9b749bf7bb72930c491e9faaf7c1426fc Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Wed, 6 Apr 2016 11:15:21 +0200 Subject: [PATCH 41/48] bugfixes for sgemm- and cgemm-kernel --- kernel/power/cgemm_kernel_8x4_power8.S | 8 ++++---- kernel/power/sgemm_kernel_16x8_power8.S | 8 ++++---- param.h | 6 +++--- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/kernel/power/cgemm_kernel_8x4_power8.S b/kernel/power/cgemm_kernel_8x4_power8.S index f90069e3f..91a48d190 100644 --- a/kernel/power/cgemm_kernel_8x4_power8.S +++ b/kernel/power/cgemm_kernel_8x4_power8.S @@ -82,7 +82,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifdef __64BIT__ -#define STACKSIZE 400 +#define STACKSIZE 512 #define ALPHA_R_SP 296(SP) #define ALPHA_I_SP 304(SP) #define FZERO 312(SP) @@ -290,9 +290,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. li o32 , 32 li o48 , 48 - li T1, 256 - slwi T1, T1, 9 // 131072 - sub BBUFFER, A, T1 // temp buffer for B unrolled + li T1, 512 + slwi T1, T1, 16 + add BBUFFER, A, T1 #ifdef __64BIT__ diff --git a/kernel/power/sgemm_kernel_16x8_power8.S b/kernel/power/sgemm_kernel_16x8_power8.S index c2dc1f651..20c94cd94 100644 --- a/kernel/power/sgemm_kernel_16x8_power8.S +++ b/kernel/power/sgemm_kernel_16x8_power8.S @@ -82,7 +82,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifdef __64BIT__ -#define STACKSIZE 340 +#define STACKSIZE 512 #define ALPHA_SP 296(SP) #define FZERO 304(SP) #else @@ -271,9 +271,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
li o32, 32 li o48, 48 - li T1, 256 - slwi T1, T1, 9 // 131072 - sub BBUFFER, A, T1 // temp buffer for B unrolled + li T1, 512 + slwi T1, T1, 16 + add BBUFFER, A, T1 addi T1, SP, 300 stxsspx f1, o0 , T1 diff --git a/param.h b/param.h index d01c992c4..84ef7671a 100644 --- a/param.h +++ b/param.h @@ -1965,7 +1965,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define DNUMOPT 8 #define GEMM_DEFAULT_OFFSET_A 131072 -#define GEMM_DEFAULT_OFFSET_B 1024 +#define GEMM_DEFAULT_OFFSET_B 131072 #define GEMM_DEFAULT_ALIGN 0x03fffUL #define SGEMM_DEFAULT_UNROLL_M 16 @@ -1985,12 +1985,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SGEMM_DEFAULT_Q 720 #define DGEMM_DEFAULT_Q 720 #define CGEMM_DEFAULT_Q 720 -#define ZGEMM_DEFAULT_Q 360 +#define ZGEMM_DEFAULT_Q 720 #define SGEMM_DEFAULT_R 14400 #define DGEMM_DEFAULT_R 14400 #define CGEMM_DEFAULT_R 14400 -#define ZGEMM_DEFAULT_R 7200 +#define ZGEMM_DEFAULT_R 14400 #define SYMV_P 8 From d4380c1fe4b9c292092145f85bcd70a2728581d2 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Thu, 7 Apr 2016 01:44:18 +0800 Subject: [PATCH 42/48] Refs xianyi/OpenBLAS-CI#10 , Fix sdot for scipy test_iterative.test_convergence test failure on AMD bulldozer and piledriver. 
--- kernel/x86_64/sdot.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/kernel/x86_64/sdot.c b/kernel/x86_64/sdot.c index a6da1fea7..a3d20d276 100644 --- a/kernel/x86_64/sdot.c +++ b/kernel/x86_64/sdot.c @@ -72,18 +72,20 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { BLASLONG i=0; BLASLONG ix=0,iy=0; + double dot = 0.0 ; - FLOAT dot = 0.0 ; + FLOAT mydot=0.0; + BLASLONG n1; if ( n <= 0 ) return(dot); if ( (inc_x == 1) && (inc_y == 1) ) { - BLASLONG n1 = n & -32; + n1 = n & (BLASLONG)(-32); if ( n1 ) - sdot_kernel_16(n1, x, y , &dot ); + sdot_kernel_16(n1, x, y , &mydot ); i = n1; @@ -94,12 +96,13 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) i++ ; } + dot+=mydot; return(dot); } - BLASLONG n1 = n & -2; + n1 = n & (BLASLONG)(-2); while(i < n1) { @@ -124,4 +127,3 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) } - From 9c42f0374a434e18302aa4a7957955dd66fc630b Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Thu, 7 Apr 2016 15:08:15 +0200 Subject: [PATCH 43/48] Updated cgemm- and sgemm-kernel for POWER8 SMP --- common_power.h | 2 +- kernel/power/cgemm_kernel_8x4_power8.S | 36 +++++++++++++++---------- kernel/power/sgemm_kernel_16x8_power8.S | 27 ++++++++++++------- param.h | 8 +++--- 4 files changed, 45 insertions(+), 28 deletions(-) diff --git a/common_power.h b/common_power.h index 052d38828..723d949f2 100644 --- a/common_power.h +++ b/common_power.h @@ -798,7 +798,7 @@ Lmcount$lazy_ptr: #elif defined(PPC440FP2) #define BUFFER_SIZE ( 16 << 20) #elif defined(POWER8) -#define BUFFER_SIZE ( 64 << 20) +#define BUFFER_SIZE ( 32 << 20) #else #define BUFFER_SIZE ( 16 << 20) #endif diff --git a/kernel/power/cgemm_kernel_8x4_power8.S b/kernel/power/cgemm_kernel_8x4_power8.S index 91a48d190..0c462ce8e 100644 --- a/kernel/power/cgemm_kernel_8x4_power8.S +++ b/kernel/power/cgemm_kernel_8x4_power8.S @@ -82,7 +82,7 @@ USE OF THIS SOFTWARE, EVEN IF 
ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifdef __64BIT__ -#define STACKSIZE 512 +#define STACKSIZE 32000 #define ALPHA_R_SP 296(SP) #define ALPHA_I_SP 304(SP) #define FZERO 312(SP) @@ -136,6 +136,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define alpha_sr vs30 #define alpha_si vs31 +#define FRAMEPOINTER r12 #define BBUFFER r14 #define L r15 @@ -161,6 +162,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. PROLOGUE PROFCODE + mr FRAMEPOINTER, SP + addi SP, SP, -STACKSIZE + addi SP, SP, -STACKSIZE + addi SP, SP, -STACKSIZE addi SP, SP, -STACKSIZE li r0, 0 @@ -233,37 +238,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifdef linux #ifdef __64BIT__ - ld LDC, FRAMESLOT(0) + STACKSIZE(SP) + ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ - ld LDC, FRAMESLOT(0) + STACKSIZE(SP) + ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER) #else #ifdef DOUBLE - lwz B, FRAMESLOT(0) + STACKSIZE(SP) - lwz C, FRAMESLOT(1) + STACKSIZE(SP) - lwz LDC, FRAMESLOT(2) + STACKSIZE(SP) + lwz B, FRAMESLOT(0) + 0(FRAMEPOINTER) + lwz C, FRAMESLOT(1) + 0(FRAMEPOINTER) + lwz LDC, FRAMESLOT(2) + 0(FRAMEPOINTER) #else - lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) + lwz LDC, FRAMESLOT(0) + 0(FRAMEPOINTER) #endif #endif #endif #ifdef TRMMKERNEL #if defined(linux) && defined(__64BIT__) - ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) + ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER) #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ - ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) + ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER) #else #ifdef DOUBLE - lwz OFFSET, FRAMESLOT(3) + STACKSIZE(SP) + lwz OFFSET, FRAMESLOT(3) + 0(FRAMEPOINTER) #else - lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) + lwz OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER) #endif #endif #endif @@ -290,9 +295,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
li o32 , 32 li o48 , 48 - li T1, 512 - slwi T1, T1, 16 - add BBUFFER, A, T1 + addi BBUFFER, SP, 512+4096 + li T1, -4096 + and BBUFFER, BBUFFER, T1 #ifdef __64BIT__ @@ -392,6 +397,9 @@ L999: #endif addi SP, SP, STACKSIZE + addi SP, SP, STACKSIZE + addi SP, SP, STACKSIZE + addi SP, SP, STACKSIZE blr diff --git a/kernel/power/sgemm_kernel_16x8_power8.S b/kernel/power/sgemm_kernel_16x8_power8.S index 20c94cd94..77f3f7cfb 100644 --- a/kernel/power/sgemm_kernel_16x8_power8.S +++ b/kernel/power/sgemm_kernel_16x8_power8.S @@ -82,7 +82,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifdef __64BIT__ -#define STACKSIZE 512 +#define STACKSIZE 32752 #define ALPHA_SP 296(SP) #define FZERO 304(SP) #else @@ -132,6 +132,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define o0 0 +#define FRAMEPOINTER r12 + #define BBUFFER r14 #define o4 r15 #define o12 r16 @@ -160,6 +162,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. PROLOGUE PROFCODE + mr FRAMEPOINTER, SP + addi SP, SP, -STACKSIZE + addi SP, SP, -STACKSIZE + addi SP, SP, -STACKSIZE addi SP, SP, -STACKSIZE li r0, 0 @@ -231,7 +237,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(_AIX) || defined(__APPLE__) #if !defined(__64BIT__) && defined(DOUBLE) - lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) + lwz LDC, FRAMESLOT(0) + 0(FRAMEPOINTER) #endif #endif @@ -239,17 +245,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if defined(TRMMKERNEL) #if defined(linux) && defined(__64BIT__) - ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) + ld OFFSET, FRAMESLOT(0) + 0(FRAMEPOINTER) #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ - ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) + ld OFFSET, FRAMESLOT(0) + 0(FRAMEPOINTER) #else #ifdef DOUBLE - lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) + lwz OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER) #else - lwz OFFSET, FRAMESLOT(0) + STACKSIZE(SP) + lwz OFFSET, FRAMESLOT(0) + 0(FRAMEPOINTER) #endif #endif #endif @@ -271,9 +277,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. li o32, 32 li o48, 48 - li T1, 512 - slwi T1, T1, 16 - add BBUFFER, A, T1 + addi BBUFFER, SP, 512+4096 + li T1, -4096 + and BBUFFER, BBUFFER, T1 addi T1, SP, 300 stxsspx f1, o0 , T1 @@ -355,6 +361,9 @@ L999: #endif addi SP, SP, STACKSIZE + addi SP, SP, STACKSIZE + addi SP, SP, STACKSIZE + addi SP, SP, STACKSIZE blr diff --git a/param.h b/param.h index 84ef7671a..2efd9b2c1 100644 --- a/param.h +++ b/param.h @@ -1964,8 +1964,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SNUMOPT 16 #define DNUMOPT 8 -#define GEMM_DEFAULT_OFFSET_A 131072 -#define GEMM_DEFAULT_OFFSET_B 131072 +#define GEMM_DEFAULT_OFFSET_A 4096 +#define GEMM_DEFAULT_OFFSET_B 4096 #define GEMM_DEFAULT_ALIGN 0x03fffUL #define SGEMM_DEFAULT_UNROLL_M 16 @@ -1987,9 +1987,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define CGEMM_DEFAULT_Q 720 #define ZGEMM_DEFAULT_Q 720 -#define SGEMM_DEFAULT_R 14400 +#define SGEMM_DEFAULT_R 21600 #define DGEMM_DEFAULT_R 14400 -#define CGEMM_DEFAULT_R 14400 +#define CGEMM_DEFAULT_R 16200 #define ZGEMM_DEFAULT_R 14400 #define SYMV_P 8 From e173c51c0416dade779478b698ccff9429034a7f Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Fri, 8 Apr 2016 09:05:37 +0200 Subject: [PATCH 44/48] updated zgemm- and ztrmm-kernel for POWER8 --- kernel/power/zgemm_kernel_8x2_power8.S | 97 +- kernel/power/zgemm_logic_8x2_power8.S | 427 ++-- kernel/power/zgemm_macros_8x2_power8.S | 497 ++-- kernel/power/ztrmm_kernel_8x2_power8.S | 2 +- kernel/power/ztrmm_macros_8x2_power8.S | 3110 ++++++++++++++++++++++++ param.h | 4 +- 6 files changed, 3611 insertions(+), 526 deletions(-) create mode 100644 kernel/power/ztrmm_macros_8x2_power8.S diff --git a/kernel/power/zgemm_kernel_8x2_power8.S b/kernel/power/zgemm_kernel_8x2_power8.S index a7665f749..336b13b1f 100644 --- a/kernel/power/zgemm_kernel_8x2_power8.S +++ b/kernel/power/zgemm_kernel_8x2_power8.S @@ -1,38 +1,3 @@ -/*************************************************************************** -Copyright (c) 2013-2016, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. 
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -/************************************************************************************** -* 2016/03/05 Werner Saar (wernsaar@googlemail.com) -* BLASTEST : OK -* CTEST : OK -* TEST : OK -* LAPACK-TEST : OK -**************************************************************************************/ - /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ @@ -82,7 +47,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifdef __64BIT__ -#define STACKSIZE 320 +#define STACKSIZE 32000 #define ALPHA_R_SP 296(SP) #define ALPHA_I_SP 304(SP) #define FZERO 312(SP) @@ -133,11 +98,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define alpha_r vs30 #define alpha_i vs31 + +#define FRAMEPOINTER r12 + +#define BBUFFER r14 + #define L r15 #define ALPHA r16 #define o24 r17 #define T2 r19 -#define KK r20 +#define BBO r20 #define o8 r21 #define I r22 #define J r23 @@ -156,8 +126,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
PROLOGUE PROFCODE - addi SP, SP, -STACKSIZE - li r0, 0 + mr FRAMEPOINTER, SP + addi SP, SP, -STACKSIZE + addi SP, SP, -STACKSIZE + addi SP, SP, -STACKSIZE + addi SP, SP, -STACKSIZE + li r0, 0 stfd f14, 0(SP) stfd f15, 8(SP) @@ -200,6 +174,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. std r17, 256(SP) std r16, 264(SP) std r15, 272(SP) + std r14, 280(SP) #else stw r31, 144(SP) stw r30, 148(SP) @@ -226,37 +201,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifdef linux #ifdef __64BIT__ - ld LDC, FRAMESLOT(0) + STACKSIZE(SP) + ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER) #endif #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ - ld LDC, FRAMESLOT(0) + STACKSIZE(SP) + ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER) #else #ifdef DOUBLE - lwz B, FRAMESLOT(0) + STACKSIZE(SP) - lwz C, FRAMESLOT(1) + STACKSIZE(SP) - lwz LDC, FRAMESLOT(2) + STACKSIZE(SP) + lwz B, FRAMESLOT(0) + 0(FRAMEPOINTER) + lwz C, FRAMESLOT(1) + 0(FRAMEPOINTER) + lwz LDC, FRAMESLOT(2) + 0(FRAMEPOINTER) #else - lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) + lwz LDC, FRAMESLOT(0) + 0(FRAMEPOINTER) #endif #endif #endif #ifdef TRMMKERNEL #if defined(linux) && defined(__64BIT__) - ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) + ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER) #endif #if defined(_AIX) || defined(__APPLE__) #ifdef __64BIT__ - ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) + ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER) #else #ifdef DOUBLE - lwz OFFSET, FRAMESLOT(3) + STACKSIZE(SP) + lwz OFFSET, FRAMESLOT(3) + 0(FRAMEPOINTER) #else - lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) + lwz OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER) #endif #endif #endif @@ -268,34 +243,38 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "zgemm_macros_8x2_power8.S" cmpwi cr0, M, 0 - ble .L999 + ble L999 cmpwi cr0, N, 0 - ble .L999 + ble L999 cmpwi cr0, K, 0 - ble .L999 + ble L999 slwi LDC, LDC, ZBASE_SHIFT - li PRE, 256 + li PRE, 384 li o8 , 8 li o16 , 16 li o24 , 24 li o32 , 32 li o48 , 48 + addi BBUFFER, SP, 512+4096 + li T1, -4096 + and BBUFFER, BBUFFER, T1 + #ifdef __64BIT__ addi ALPHA, SP, 296 #else addi ALPHA, SP, 224 #endif - lxvdsx alpha_r, 0, ALPHA - lxvdsx alpha_i, o8, ALPHA + lxsdx alpha_r, 0, ALPHA + lxsdx alpha_i, o8, ALPHA - .align 5 + .align 4 #include "zgemm_logic_8x2_power8.S" -.L999: +L999: addi r3, 0, 0 lfd f14, 0(SP) @@ -339,6 +318,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld r17, 256(SP) ld r16, 264(SP) ld r15, 272(SP) + ld r14, 280(SP) #else lwz r31, 144(SP) lwz r30, 148(SP) @@ -360,6 +340,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif addi SP, SP, STACKSIZE + addi SP, SP, STACKSIZE + addi SP, SP, STACKSIZE + addi SP, SP, STACKSIZE blr diff --git a/kernel/power/zgemm_logic_8x2_power8.S b/kernel/power/zgemm_logic_8x2_power8.S index 5fcade5bf..96612da82 100644 --- a/kernel/power/zgemm_logic_8x2_power8.S +++ b/kernel/power/zgemm_logic_8x2_power8.S @@ -1,83 +1,111 @@ srawi. J, N, 1 - ble .LZGEMM_L2_END + ble ZGEMM_L2_END + +ZGEMM_L2_BEGIN: + + mr BO, B + mr BBO, BBUFFER + slwi T1, K, 1 + +ZGEMM_L2_COPYB: + + lxvdsx vs4, o0, BO // b0_r + lxvdsx vs5, o8, BO // b0_i + addi BO, BO, 16 + stxvd2x vs4, o0, BBO + stxvd2x vs5, o16, BBO + addic. T1, T1, -1 + addi BBO, BBO, 32 + + bge ZGEMM_L2_COPYB -.LZGEMM_L2_BEGIN: mr CO, C mr AO, A slwi T1, LDC , 1 add C, C, T1 srawi. I, M, 3 - ble .LZGEMM_L2x8_END + ble ZGEMM_L2x8_END -.LZGEMM_L2x8_BEGIN: +ZGEMM_L2x8_BEGIN: - mr BO, B + mr BO, BBUFFER srawi. 
L, K, 3 - ble .LZGEMM_L2x8_SUB0 + ble ZGEMM_L2x8_SUB0 cmpwi cr0, L, 1 - ble .LZGEMM_L2x8_SUB4 + ble ZGEMM_L2x8_SUB4 -.LZGEMM_L2x8_LOOP_START: +ZGEMM_L2x8_LOOP_START: dcbt AO, PRE + dcbt BO, PRE LOAD2x8_1 dcbt AO, PRE KERNEL2x8_I1 dcbt AO, PRE + dcbt BO, PRE KERNEL2x8_2 dcbt AO, PRE KERNEL2x8_1 dcbt AO, PRE + dcbt BO, PRE KERNEL2x8_2 dcbt AO, PRE KERNEL2x8_1 dcbt AO, PRE + dcbt BO, PRE KERNEL2x8_2 dcbt AO, PRE KERNEL2x8_1 dcbt AO, PRE + dcbt BO, PRE KERNEL2x8_2 addic. L, L, -2 - ble .LZGEMM_L2x8_LOOP_END + ble ZGEMM_L2x8_LOOP_END .align 5 -.LZGEMM_L2x8_LOOP: +ZGEMM_L2x8_LOOP: dcbt AO, PRE KERNEL2x8_1 dcbt AO, PRE + dcbt BO, PRE KERNEL2x8_2 dcbt AO, PRE KERNEL2x8_1 dcbt AO, PRE + dcbt BO, PRE KERNEL2x8_2 dcbt AO, PRE KERNEL2x8_1 dcbt AO, PRE + dcbt BO, PRE KERNEL2x8_2 dcbt AO, PRE KERNEL2x8_1 dcbt AO, PRE + dcbt BO, PRE KERNEL2x8_2 addic. L, L, -1 - bgt .LZGEMM_L2x8_LOOP + bgt ZGEMM_L2x8_LOOP -.LZGEMM_L2x8_LOOP_END: +ZGEMM_L2x8_LOOP_END: dcbt AO, PRE KERNEL2x8_1 dcbt AO, PRE + dcbt BO, PRE KERNEL2x8_2 dcbt AO, PRE KERNEL2x8_1 dcbt AO, PRE + dcbt BO, PRE KERNEL2x8_2 dcbt AO, PRE @@ -88,9 +116,9 @@ KERNEL2x8_1 KERNEL2x8_E2 - b .LZGEMM_L2x8_SUB1 + b ZGEMM_L2x8_SUB1 -.LZGEMM_L2x8_SUB4: +ZGEMM_L2x8_SUB4: dcbt AO, PRE KERNEL2x8_SUBI1 @@ -106,53 +134,53 @@ KERNEL2x8_SUB1 KERNEL2x8_SUB1 - b .LZGEMM_L2x8_SUB1 + b ZGEMM_L2x8_SUB1 -.LZGEMM_L2x8_SUB0: +ZGEMM_L2x8_SUB0: andi. L, K, 7 KERNEL2x8_SUBI1 addic. L, L, -1 - ble .LZGEMM_L2x8_SAVE - b .LZGEMM_L2x8_SUB2 + ble ZGEMM_L2x8_SAVE + b ZGEMM_L2x8_SUB2 -.LZGEMM_L2x8_SUB1: +ZGEMM_L2x8_SUB1: andi. L, K, 7 - ble .LZGEMM_L2x8_SAVE + ble ZGEMM_L2x8_SAVE -.LZGEMM_L2x8_SUB2: +ZGEMM_L2x8_SUB2: KERNEL2x8_SUB1 addic. L, L, -1 - bgt .LZGEMM_L2x8_SUB2 + bgt ZGEMM_L2x8_SUB2 -.LZGEMM_L2x8_SAVE: +ZGEMM_L2x8_SAVE: SAVE2x8 addic. I, I, -1 - bgt .LZGEMM_L2x8_BEGIN + bgt ZGEMM_L2x8_BEGIN -.LZGEMM_L2x8_END: +ZGEMM_L2x8_END: -.LZGEMM_L2x4_BEGIN: +ZGEMM_L2x4_BEGIN: andi. T2, M, 7 - ble .LZGEMM_L2x1_END + ble ZGEMM_L2x1_END andi. 
T1, M, 4 - ble .LZGEMM_L2x4_END - mr BO, B + ble ZGEMM_L2x4_END + mr BO, BBUFFER srawi. L, K, 3 - ble .LZGEMM_L2x4_SUB0 + ble ZGEMM_L2x4_SUB0 cmpwi cr0, L, 1 - ble .LZGEMM_L2x4_SUB4 + ble ZGEMM_L2x4_SUB4 -.LZGEMM_L2x4_LOOP_START: +ZGEMM_L2x4_LOOP_START: LOAD2x4_1 KERNEL2x4_I1 @@ -166,11 +194,11 @@ KERNEL2x4_2 addic. L, L, -2 - ble .LZGEMM_L2x4_LOOP_END + ble ZGEMM_L2x4_LOOP_END .align 5 -.LZGEMM_L2x4_LOOP: +ZGEMM_L2x4_LOOP: KERNEL2x4_1 KERNEL2x4_2 @@ -183,9 +211,9 @@ KERNEL2x4_2 addic. L, L, -1 - bgt .LZGEMM_L2x4_LOOP + bgt ZGEMM_L2x4_LOOP -.LZGEMM_L2x4_LOOP_END: +ZGEMM_L2x4_LOOP_END: KERNEL2x4_1 KERNEL2x4_2 @@ -197,9 +225,9 @@ KERNEL2x4_1 KERNEL2x4_E2 - b .LZGEMM_L2x4_SUB1 + b ZGEMM_L2x4_SUB1 -.LZGEMM_L2x4_SUB4: +ZGEMM_L2x4_SUB4: KERNEL2x4_SUBI1 KERNEL2x4_SUB1 @@ -211,48 +239,48 @@ KERNEL2x4_SUB1 KERNEL2x4_SUB1 - b .LZGEMM_L2x4_SUB1 + b ZGEMM_L2x4_SUB1 -.LZGEMM_L2x4_SUB0: +ZGEMM_L2x4_SUB0: andi. L, K, 7 KERNEL2x4_SUBI1 addic. L, L, -1 - ble .LZGEMM_L2x4_SAVE - b .LZGEMM_L2x4_SUB2 + ble ZGEMM_L2x4_SAVE + b ZGEMM_L2x4_SUB2 -.LZGEMM_L2x4_SUB1: +ZGEMM_L2x4_SUB1: andi. L, K, 7 - ble .LZGEMM_L2x4_SAVE + ble ZGEMM_L2x4_SAVE -.LZGEMM_L2x4_SUB2: +ZGEMM_L2x4_SUB2: KERNEL2x4_SUB1 addic. L, L, -1 - bgt .LZGEMM_L2x4_SUB2 + bgt ZGEMM_L2x4_SUB2 -.LZGEMM_L2x4_SAVE: +ZGEMM_L2x4_SAVE: SAVE2x4 -.LZGEMM_L2x4_END: +ZGEMM_L2x4_END: -.LZGEMM_L2x2_BEGIN: +ZGEMM_L2x2_BEGIN: andi. T1, M, 2 - ble .LZGEMM_L2x2_END - mr BO, B + ble ZGEMM_L2x2_END + mr BO, BBUFFER srawi. L, K, 3 - ble .LZGEMM_L2x2_SUB0 + ble ZGEMM_L2x2_SUB0 cmpwi cr0, L, 1 - ble .LZGEMM_L2x2_SUB4 + ble ZGEMM_L2x2_SUB4 -.LZGEMM_L2x2_LOOP_START: +ZGEMM_L2x2_LOOP_START: LOAD2x2_1 KERNEL2x2_I1 @@ -266,11 +294,11 @@ KERNEL2x2_2 addic. L, L, -2 - ble .LZGEMM_L2x2_LOOP_END + ble ZGEMM_L2x2_LOOP_END .align 5 -.LZGEMM_L2x2_LOOP: +ZGEMM_L2x2_LOOP: KERNEL2x2_1 KERNEL2x2_2 @@ -283,9 +311,9 @@ KERNEL2x2_2 addic. 
L, L, -1 - bgt .LZGEMM_L2x2_LOOP + bgt ZGEMM_L2x2_LOOP -.LZGEMM_L2x2_LOOP_END: +ZGEMM_L2x2_LOOP_END: KERNEL2x2_1 KERNEL2x2_2 @@ -297,9 +325,9 @@ KERNEL2x2_1 KERNEL2x2_E2 - b .LZGEMM_L2x2_SUB1 + b ZGEMM_L2x2_SUB1 -.LZGEMM_L2x2_SUB4: +ZGEMM_L2x2_SUB4: KERNEL2x2_SUBI1 KERNEL2x2_SUB1 @@ -311,48 +339,48 @@ KERNEL2x2_SUB1 KERNEL2x2_SUB1 - b .LZGEMM_L2x2_SUB1 + b ZGEMM_L2x2_SUB1 -.LZGEMM_L2x2_SUB0: +ZGEMM_L2x2_SUB0: andi. L, K, 7 KERNEL2x2_SUBI1 addic. L, L, -1 - ble .LZGEMM_L2x2_SAVE - b .LZGEMM_L2x2_SUB2 + ble ZGEMM_L2x2_SAVE + b ZGEMM_L2x2_SUB2 -.LZGEMM_L2x2_SUB1: +ZGEMM_L2x2_SUB1: andi. L, K, 7 - ble .LZGEMM_L2x2_SAVE + ble ZGEMM_L2x2_SAVE -.LZGEMM_L2x2_SUB2: +ZGEMM_L2x2_SUB2: KERNEL2x2_SUB1 addic. L, L, -1 - bgt .LZGEMM_L2x2_SUB2 + bgt ZGEMM_L2x2_SUB2 -.LZGEMM_L2x2_SAVE: +ZGEMM_L2x2_SAVE: SAVE2x2 -.LZGEMM_L2x2_END: +ZGEMM_L2x2_END: -.LZGEMM_L2x1_BEGIN: +ZGEMM_L2x1_BEGIN: andi. T1, M, 1 - ble .LZGEMM_L2x1_END - mr BO, B + ble ZGEMM_L2x1_END + mr BO, BBUFFER srawi. L, K, 3 - ble .LZGEMM_L2x1_SUB0 + ble ZGEMM_L2x1_SUB0 cmpwi cr0, L, 1 - ble .LZGEMM_L2x1_SUB4 + ble ZGEMM_L2x1_SUB4 -.LZGEMM_L2x1_LOOP_START: +ZGEMM_L2x1_LOOP_START: LOAD2x1_1 KERNEL2x1_I1 @@ -366,11 +394,11 @@ KERNEL2x1_2 addic. L, L, -2 - ble .LZGEMM_L2x1_LOOP_END + ble ZGEMM_L2x1_LOOP_END .align 5 -.LZGEMM_L2x1_LOOP: +ZGEMM_L2x1_LOOP: KERNEL2x1_1 KERNEL2x1_2 @@ -383,9 +411,9 @@ KERNEL2x1_2 addic. L, L, -1 - bgt .LZGEMM_L2x1_LOOP + bgt ZGEMM_L2x1_LOOP -.LZGEMM_L2x1_LOOP_END: +ZGEMM_L2x1_LOOP_END: KERNEL2x1_1 KERNEL2x1_2 @@ -397,9 +425,9 @@ KERNEL2x1_1 KERNEL2x1_E2 - b .LZGEMM_L2x1_SUB1 + b ZGEMM_L2x1_SUB1 -.LZGEMM_L2x1_SUB4: +ZGEMM_L2x1_SUB4: KERNEL2x1_SUBI1 KERNEL2x1_SUB1 @@ -411,72 +439,89 @@ KERNEL2x1_SUB1 KERNEL2x1_SUB1 - b .LZGEMM_L2x1_SUB1 + b ZGEMM_L2x1_SUB1 -.LZGEMM_L2x1_SUB0: +ZGEMM_L2x1_SUB0: andi. L, K, 7 KERNEL2x1_SUBI1 addic. L, L, -1 - ble .LZGEMM_L2x1_SAVE - b .LZGEMM_L2x1_SUB2 + ble ZGEMM_L2x1_SAVE + b ZGEMM_L2x1_SUB2 -.LZGEMM_L2x1_SUB1: +ZGEMM_L2x1_SUB1: andi. 
L, K, 7 - ble .LZGEMM_L2x1_SAVE + ble ZGEMM_L2x1_SAVE -.LZGEMM_L2x1_SUB2: +ZGEMM_L2x1_SUB2: KERNEL2x1_SUB1 addic. L, L, -1 - bgt .LZGEMM_L2x1_SUB2 + bgt ZGEMM_L2x1_SUB2 -.LZGEMM_L2x1_SAVE: +ZGEMM_L2x1_SAVE: SAVE2x1 -.LZGEMM_L2x1_END: +ZGEMM_L2x1_END: slwi T1, K, 5 add B, B, T1 addic. J, J, -1 - bgt .LZGEMM_L2_BEGIN + bgt ZGEMM_L2_BEGIN andi. T2, N, 1 - ble .L999 + ble L999 -.LZGEMM_L2_END: +ZGEMM_L2_END: - b .LZGEMM_L1_BEGIN + b ZGEMM_L1_BEGIN -.L999_H1: +L999_H1: - b .L999 + b L999 + +ZGEMM_L1_BEGIN: + + mr BO, B + mr BBO, BBUFFER + slwi T1, K, 0 + +ZGEMM_L1_COPYB: + + lxvdsx vs4, o0, BO // b0_r + lxvdsx vs5, o8, BO // b0_i + addi BO, BO, 16 + stxvd2x vs4, o0, BBO + stxvd2x vs5, o16, BBO + addic. T1, T1, -1 + addi BBO, BBO, 32 + + bge ZGEMM_L1_COPYB -.LZGEMM_L1_BEGIN: andi. T1, N, 1 - ble .LZGEMM_L1_END + ble ZGEMM_L1_END mr CO, C mr AO, A srawi. I, M, 3 - ble .LZGEMM_L1x8_END + ble ZGEMM_L1x8_END -.LZGEMM_L1x8_BEGIN: +ZGEMM_L1x8_BEGIN: - mr BO, B + mr BO, BBUFFER srawi. L, K, 3 - ble .LZGEMM_L1x8_SUB0 + ble ZGEMM_L1x8_SUB0 cmpwi cr0, L, 1 - ble .LZGEMM_L1x8_SUB4 + ble ZGEMM_L1x8_SUB4 -.LZGEMM_L1x8_LOOP_START: +ZGEMM_L1x8_LOOP_START: dcbt AO, PRE LOAD1x8_1 @@ -499,11 +544,11 @@ KERNEL1x8_2 addic. L, L, -2 - ble .LZGEMM_L1x8_LOOP_END + ble ZGEMM_L1x8_LOOP_END .align 5 -.LZGEMM_L1x8_LOOP: +ZGEMM_L1x8_LOOP: dcbt AO, PRE KERNEL1x8_1 @@ -524,9 +569,9 @@ KERNEL1x8_2 addic. L, L, -1 - bgt .LZGEMM_L1x8_LOOP + bgt ZGEMM_L1x8_LOOP -.LZGEMM_L1x8_LOOP_END: +ZGEMM_L1x8_LOOP_END: dcbt AO, PRE KERNEL1x8_1 @@ -545,9 +590,9 @@ KERNEL1x8_1 KERNEL1x8_E2 - b .LZGEMM_L1x8_SUB1 + b ZGEMM_L1x8_SUB1 -.LZGEMM_L1x8_SUB4: +ZGEMM_L1x8_SUB4: dcbt AO, PRE KERNEL1x8_SUBI1 @@ -563,53 +608,53 @@ KERNEL1x8_SUB1 KERNEL1x8_SUB1 - b .LZGEMM_L1x8_SUB1 + b ZGEMM_L1x8_SUB1 -.LZGEMM_L1x8_SUB0: +ZGEMM_L1x8_SUB0: andi. L, K, 7 KERNEL1x8_SUBI1 addic. L, L, -1 - ble .LZGEMM_L1x8_SAVE - b .LZGEMM_L1x8_SUB2 + ble ZGEMM_L1x8_SAVE + b ZGEMM_L1x8_SUB2 -.LZGEMM_L1x8_SUB1: +ZGEMM_L1x8_SUB1: andi. 
L, K, 7 - ble .LZGEMM_L1x8_SAVE + ble ZGEMM_L1x8_SAVE -.LZGEMM_L1x8_SUB2: +ZGEMM_L1x8_SUB2: KERNEL1x8_SUB1 addic. L, L, -1 - bgt .LZGEMM_L1x8_SUB2 + bgt ZGEMM_L1x8_SUB2 -.LZGEMM_L1x8_SAVE: +ZGEMM_L1x8_SAVE: SAVE1x8 addic. I, I, -1 - bgt .LZGEMM_L1x8_BEGIN + bgt ZGEMM_L1x8_BEGIN -.LZGEMM_L1x8_END: +ZGEMM_L1x8_END: -.LZGEMM_L1x4_BEGIN: +ZGEMM_L1x4_BEGIN: andi. T2, M, 7 - ble .LZGEMM_L1x1_END + ble ZGEMM_L1x1_END andi. T1, M, 4 - ble .LZGEMM_L1x4_END - mr BO, B + ble ZGEMM_L1x4_END + mr BO, BBUFFER srawi. L, K, 3 - ble .LZGEMM_L1x4_SUB0 + ble ZGEMM_L1x4_SUB0 cmpwi cr0, L, 1 - ble .LZGEMM_L1x4_SUB4 + ble ZGEMM_L1x4_SUB4 -.LZGEMM_L1x4_LOOP_START: +ZGEMM_L1x4_LOOP_START: LOAD1x4_1 KERNEL1x4_I1 @@ -623,11 +668,11 @@ KERNEL1x4_2 addic. L, L, -2 - ble .LZGEMM_L1x4_LOOP_END + ble ZGEMM_L1x4_LOOP_END .align 5 -.LZGEMM_L1x4_LOOP: +ZGEMM_L1x4_LOOP: KERNEL1x4_1 KERNEL1x4_2 @@ -640,9 +685,9 @@ KERNEL1x4_2 addic. L, L, -1 - bgt .LZGEMM_L1x4_LOOP + bgt ZGEMM_L1x4_LOOP -.LZGEMM_L1x4_LOOP_END: +ZGEMM_L1x4_LOOP_END: KERNEL1x4_1 KERNEL1x4_2 @@ -654,9 +699,9 @@ KERNEL1x4_1 KERNEL1x4_E2 - b .LZGEMM_L1x4_SUB1 + b ZGEMM_L1x4_SUB1 -.LZGEMM_L1x4_SUB4: +ZGEMM_L1x4_SUB4: KERNEL1x4_SUBI1 KERNEL1x4_SUB1 @@ -668,48 +713,48 @@ KERNEL1x4_SUB1 KERNEL1x4_SUB1 - b .LZGEMM_L1x4_SUB1 + b ZGEMM_L1x4_SUB1 -.LZGEMM_L1x4_SUB0: +ZGEMM_L1x4_SUB0: andi. L, K, 7 KERNEL1x4_SUBI1 addic. L, L, -1 - ble .LZGEMM_L1x4_SAVE - b .LZGEMM_L1x4_SUB2 + ble ZGEMM_L1x4_SAVE + b ZGEMM_L1x4_SUB2 -.LZGEMM_L1x4_SUB1: +ZGEMM_L1x4_SUB1: andi. L, K, 7 - ble .LZGEMM_L1x4_SAVE + ble ZGEMM_L1x4_SAVE -.LZGEMM_L1x4_SUB2: +ZGEMM_L1x4_SUB2: KERNEL1x4_SUB1 addic. L, L, -1 - bgt .LZGEMM_L1x4_SUB2 + bgt ZGEMM_L1x4_SUB2 -.LZGEMM_L1x4_SAVE: +ZGEMM_L1x4_SAVE: SAVE1x4 -.LZGEMM_L1x4_END: +ZGEMM_L1x4_END: -.LZGEMM_L1x2_BEGIN: +ZGEMM_L1x2_BEGIN: andi. T1, M, 2 - ble .LZGEMM_L1x2_END - mr BO, B + ble ZGEMM_L1x2_END + mr BO, BBUFFER srawi. 
L, K, 3 - ble .LZGEMM_L1x2_SUB0 + ble ZGEMM_L1x2_SUB0 cmpwi cr0, L, 1 - ble .LZGEMM_L1x2_SUB4 + ble ZGEMM_L1x2_SUB4 -.LZGEMM_L1x2_LOOP_START: +ZGEMM_L1x2_LOOP_START: LOAD1x2_1 KERNEL1x2_I1 @@ -723,11 +768,11 @@ KERNEL1x2_2 addic. L, L, -2 - ble .LZGEMM_L1x2_LOOP_END + ble ZGEMM_L1x2_LOOP_END .align 5 -.LZGEMM_L1x2_LOOP: +ZGEMM_L1x2_LOOP: KERNEL1x2_1 KERNEL1x2_2 @@ -740,9 +785,9 @@ KERNEL1x2_2 addic. L, L, -1 - bgt .LZGEMM_L1x2_LOOP + bgt ZGEMM_L1x2_LOOP -.LZGEMM_L1x2_LOOP_END: +ZGEMM_L1x2_LOOP_END: KERNEL1x2_1 KERNEL1x2_2 @@ -754,9 +799,9 @@ KERNEL1x2_1 KERNEL1x2_E2 - b .LZGEMM_L1x2_SUB1 + b ZGEMM_L1x2_SUB1 -.LZGEMM_L1x2_SUB4: +ZGEMM_L1x2_SUB4: KERNEL1x2_SUBI1 KERNEL1x2_SUB1 @@ -768,48 +813,48 @@ KERNEL1x2_SUB1 KERNEL1x2_SUB1 - b .LZGEMM_L1x2_SUB1 + b ZGEMM_L1x2_SUB1 -.LZGEMM_L1x2_SUB0: +ZGEMM_L1x2_SUB0: andi. L, K, 7 KERNEL1x2_SUBI1 addic. L, L, -1 - ble .LZGEMM_L1x2_SAVE - b .LZGEMM_L1x2_SUB2 + ble ZGEMM_L1x2_SAVE + b ZGEMM_L1x2_SUB2 -.LZGEMM_L1x2_SUB1: +ZGEMM_L1x2_SUB1: andi. L, K, 7 - ble .LZGEMM_L1x2_SAVE + ble ZGEMM_L1x2_SAVE -.LZGEMM_L1x2_SUB2: +ZGEMM_L1x2_SUB2: KERNEL1x2_SUB1 addic. L, L, -1 - bgt .LZGEMM_L1x2_SUB2 + bgt ZGEMM_L1x2_SUB2 -.LZGEMM_L1x2_SAVE: +ZGEMM_L1x2_SAVE: SAVE1x2 -.LZGEMM_L1x2_END: +ZGEMM_L1x2_END: -.LZGEMM_L1x1_BEGIN: +ZGEMM_L1x1_BEGIN: andi. T1, M, 1 - ble .LZGEMM_L1x1_END - mr BO, B + ble ZGEMM_L1x1_END + mr BO, BBUFFER srawi. L, K, 3 - ble .LZGEMM_L1x1_SUB0 + ble ZGEMM_L1x1_SUB0 cmpwi cr0, L, 1 - ble .LZGEMM_L1x1_SUB4 + ble ZGEMM_L1x1_SUB4 -.LZGEMM_L1x1_LOOP_START: +ZGEMM_L1x1_LOOP_START: LOAD1x1_1 KERNEL1x1_I1 @@ -823,11 +868,11 @@ KERNEL1x1_2 addic. L, L, -2 - ble .LZGEMM_L1x1_LOOP_END + ble ZGEMM_L1x1_LOOP_END .align 5 -.LZGEMM_L1x1_LOOP: +ZGEMM_L1x1_LOOP: KERNEL1x1_1 KERNEL1x1_2 @@ -840,9 +885,9 @@ KERNEL1x1_2 addic. 
L, L, -1 - bgt .LZGEMM_L1x1_LOOP + bgt ZGEMM_L1x1_LOOP -.LZGEMM_L1x1_LOOP_END: +ZGEMM_L1x1_LOOP_END: KERNEL1x1_1 KERNEL1x1_2 @@ -854,9 +899,9 @@ KERNEL1x1_1 KERNEL1x1_E2 - b .LZGEMM_L1x1_SUB1 + b ZGEMM_L1x1_SUB1 -.LZGEMM_L1x1_SUB4: +ZGEMM_L1x1_SUB4: KERNEL1x1_SUBI1 KERNEL1x1_SUB1 @@ -868,34 +913,34 @@ KERNEL1x1_SUB1 KERNEL1x1_SUB1 - b .LZGEMM_L1x1_SUB1 + b ZGEMM_L1x1_SUB1 -.LZGEMM_L1x1_SUB0: +ZGEMM_L1x1_SUB0: andi. L, K, 7 KERNEL1x1_SUBI1 addic. L, L, -1 - ble .LZGEMM_L1x1_SAVE - b .LZGEMM_L1x1_SUB2 + ble ZGEMM_L1x1_SAVE + b ZGEMM_L1x1_SUB2 -.LZGEMM_L1x1_SUB1: +ZGEMM_L1x1_SUB1: andi. L, K, 7 - ble .LZGEMM_L1x1_SAVE + ble ZGEMM_L1x1_SAVE -.LZGEMM_L1x1_SUB2: +ZGEMM_L1x1_SUB2: KERNEL1x1_SUB1 addic. L, L, -1 - bgt .LZGEMM_L1x1_SUB2 + bgt ZGEMM_L1x1_SUB2 -.LZGEMM_L1x1_SAVE: +ZGEMM_L1x1_SAVE: SAVE1x1 -.LZGEMM_L1x1_END: +ZGEMM_L1x1_END: -.LZGEMM_L1_END: +ZGEMM_L1_END: diff --git a/kernel/power/zgemm_macros_8x2_power8.S b/kernel/power/zgemm_macros_8x2_power8.S index 701ec65c8..a0fbb2e11 100644 --- a/kernel/power/zgemm_macros_8x2_power8.S +++ b/kernel/power/zgemm_macros_8x2_power8.S @@ -1,39 +1,3 @@ -/*************************************************************************** -Copyright (c) 2013-2016, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. 
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -/************************************************************************************** -* 2016/03/05 Werner Saar (wernsaar@googlemail.com) -* BLASTEST : OK -* CTEST : OK -* TEST : OK -* LAPACK-TEST : OK -**************************************************************************************/ - - #if defined(NN) || defined(NT) || defined(TN) || defined(TT) #define XSFADD_R1 xsadddp @@ -70,12 +34,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro LOAD2x8_1 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B - lxvdsx vs18, o16, BO // load real part from B - lxvdsx vs19, o24, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B + lxvd2x vs18, o32, BO // load real part from B + lxvd2x vs19, o48, BO // load imag part from B - addi BO, BO, 32 + addi BO, BO, 64 lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -110,12 +74,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi AO, AO, 64 - lxvdsx vs20, o0, BO // load real part from B - lxvdsx vs21, o8, BO // load imag part from B - lxvdsx vs22, o16, BO // load real part from B - lxvdsx vs23, o24, BO // load imag part from B + lxvd2x vs20, o0, BO // load real part from B + lxvd2x vs21, o16, BO // load imag part from B + lxvd2x vs22, o32, BO // load real part from B + lxvd2x vs23, o48, BO // load imag part from B - addi BO, BO, 32 + addi BO, BO, 64 xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs33, vs0, vs17 // real*imag, imag*imag @@ -156,36 +120,41 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL2x8_1 + lxvd2x vs8, o0, AO // load real,imag from A + lxvd2x vs9, o16, AO // load real,imag from A + lxvd2x vs10, o32, AO // load real,imag from A + lxvd2x vs11, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvd2x vs12, o0, AO // load real,imag from A + lxvd2x vs13, o16, AO // load real,imag from A + lxvd2x vs14, o32, AO // load real,imag from A + lxvd2x vs15, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvd2x vs20, o0, BO // load real part from B + lxvd2x vs21, o16, BO // load imag part from B + lxvd2x vs22, o32, BO // load real part from B + lxvd2x vs23, o48, BO // load imag part from B + + addi BO, BO, 64 xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag xvmaddadp vs34, vs1, vs16 // real*real, imag*real xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag - - lxvdsx vs22, o16, BO // load real part from B - lxvdsx vs23, o24, BO // load imag part from B - xvmaddadp vs36, vs2, vs16 // real*real, imag*real xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag xvmaddadp vs38, vs3, vs16 // real*real, imag*real xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag - - lxvd2x vs8, o0, AO // load real,imag from A - lxvd2x vs9, o16, AO // load real,imag from A - xvmaddadp vs40, vs4, vs16 // real*real, imag*real xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag xvmaddadp vs42, vs5, 
vs16 // real*real, imag*real xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag - - lxvd2x vs10, o32, AO // load real,imag from A - lxvd2x vs11, o48, AO // load real,imag from A - xvmaddadp vs44, vs6, vs16 // real*real, imag*real xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag - - addi AO, AO, 64 - xvmaddadp vs46, vs7, vs16 // real*real, imag*real xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag @@ -193,101 +162,79 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs49, vs0, vs19 // real*imag, imag*imag xvmaddadp vs50, vs1, vs18 // real*real, imag*real xvmaddadp vs51, vs1, vs19 // real*imag, imag*imag - - lxvd2x vs12, o0, AO // load real,imag from A - lxvd2x vs13, o16, AO // load real,imag from A - xvmaddadp vs52, vs2, vs18 // real*real, imag*real xvmaddadp vs53, vs2, vs19 // real*imag, imag*imag xvmaddadp vs54, vs3, vs18 // real*real, imag*real xvmaddadp vs55, vs3, vs19 // real*imag, imag*imag - - lxvd2x vs14, o32, AO // load real,imag from A - lxvd2x vs15, o48, AO // load real,imag from A - xvmaddadp vs56, vs4, vs18 // real*real, imag*real xvmaddadp vs57, vs4, vs19 // real*imag, imag*imag xvmaddadp vs58, vs5, vs18 // real*real, imag*real xvmaddadp vs59, vs5, vs19 // real*imag, imag*imag - - lxvdsx vs20, o0, BO // load real part from B - lxvdsx vs21, o8, BO // load imag part from B - xvmaddadp vs60, vs6, vs18 // real*real, imag*real xvmaddadp vs61, vs6, vs19 // real*imag, imag*imag xvmaddadp vs62, vs7, vs18 // real*real, imag*real xvmaddadp vs63, vs7, vs19 // real*imag, imag*imag - addi AO, AO, 64 - addi BO, BO, 32 .endm .macro KERNEL2x8_2 + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvd2x vs4, o0, AO // load real,imag from A + lxvd2x vs5, o16, AO // load real,imag from A + lxvd2x vs6, o32, AO // load real,imag from A + lxvd2x vs7, o48, AO // load real,imag from 
A + + addi AO, AO, 64 + + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B + lxvd2x vs18, o32, BO // load real part from B + lxvd2x vs19, o48, BO // load imag part from B + + addi BO, BO, 64 xvmaddadp vs32, vs8, vs20 // real*real, imag*real xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag xvmaddadp vs34, vs9, vs20 // real*real, imag*real xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag - - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B - xvmaddadp vs36, vs10, vs20 // real*real, imag*real xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag xvmaddadp vs38, vs11, vs20 // real*real, imag*real xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag - - lxvd2x vs0, o0, AO // load real,imag from A - lxvd2x vs1, o16, AO // load real,imag from A - xvmaddadp vs40, vs12, vs20 // real*real, imag*real xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag xvmaddadp vs42, vs13, vs20 // real*real, imag*real xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag - - lxvd2x vs2, o32, AO // load real,imag from A - lxvd2x vs3, o48, AO // load real,imag from A - xvmaddadp vs44, vs14, vs20 // real*real, imag*real xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag xvmaddadp vs46, vs15, vs20 // real*real, imag*real xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag - addi AO, AO, 64 - xvmaddadp vs48, vs8, vs22 // real*real, imag*real xvmaddadp vs49, vs8, vs23 // real*imag, imag*imag xvmaddadp vs50, vs9, vs22 // real*real, imag*real xvmaddadp vs51, vs9, vs23 // real*imag, imag*imag - - lxvd2x vs4, o0, AO // load real,imag from A - lxvd2x vs5, o16, AO // load real,imag from A - xvmaddadp vs52, vs10, vs22 // real*real, imag*real xvmaddadp vs53, vs10, vs23 // real*imag, imag*imag xvmaddadp vs54, vs11, vs22 // real*real, imag*real xvmaddadp vs55, vs11, vs23 // real*imag, imag*imag - - lxvd2x vs6, o32, AO // load real,imag from A - lxvd2x vs7, o48, AO // load real,imag from A - xvmaddadp vs56, vs12, vs22 // real*real, 
imag*real xvmaddadp vs57, vs12, vs23 // real*imag, imag*imag xvmaddadp vs58, vs13, vs22 // real*real, imag*real xvmaddadp vs59, vs13, vs23 // real*imag, imag*imag - - lxvdsx vs18, o16, BO // load real part from B - lxvdsx vs19, o24, BO // load imag part from B - xvmaddadp vs60, vs14, vs22 // real*real, imag*real xvmaddadp vs61, vs14, vs23 // real*imag, imag*imag xvmaddadp vs62, vs15, vs22 // real*real, imag*real xvmaddadp vs63, vs15, vs23 // real*imag, imag*imag - addi AO, AO, 64 - addi BO, BO, 32 .endm @@ -347,12 +294,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B - lxvdsx vs18, o16, BO // load real part from B - lxvdsx vs19, o24, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B + lxvd2x vs18, o32, BO // load real part from B + lxvd2x vs19, o48, BO // load imag part from B - addi BO, BO, 32 + addi BO, BO, 64 xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs33, vs0, vs17 // real*imag, imag*imag @@ -407,12 +354,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B - lxvdsx vs18, o16, BO // load real part from B - lxvdsx vs19, o24, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B + lxvd2x vs18, o32, BO // load real part from B + lxvd2x vs19, o48, BO // load imag part from B - addi BO, BO, 32 + addi BO, BO, 64 xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag @@ -927,12 +874,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro LOAD2x4_1 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B - lxvdsx vs18, o16, BO // load real part from B - lxvdsx vs19, o24, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B + lxvd2x vs18, o32, BO // load real part from B + lxvd2x vs19, o48, BO // load imag part from B - addi BO, BO, 32 + addi BO, BO, 64 lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -953,12 +900,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 - lxvdsx vs20, o0, BO // load real part from B - lxvdsx vs21, o8, BO // load imag part from B - lxvdsx vs22, o16, BO // load real part from B - lxvdsx vs23, o24, BO // load imag part from B + lxvd2x vs20, o0, BO // load real part from B + lxvd2x vs21, o16, BO // load imag part from B + lxvd2x vs22, o32, BO // load real part from B + lxvd2x vs23, o48, BO // load imag part from B - addi BO, BO, 32 + addi BO, BO, 64 xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs33, vs0, vs17 // real*imag, imag*imag @@ -990,12 +937,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 - lxvdsx vs20, o0, BO // load real part from B - lxvdsx vs21, o8, BO // load imag part from B - lxvdsx vs22, o16, BO // load real part from B - lxvdsx vs23, o24, BO // load imag part from B + lxvd2x vs20, o0, BO // load real part from B + lxvd2x vs21, o16, BO // load imag part from B + lxvd2x vs22, o32, BO // load real part from B + lxvd2x vs23, o48, BO // load imag part from B - addi BO, BO, 32 + addi BO, BO, 64 xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag @@ -1027,12 +974,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi AO, AO, 64 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B - lxvdsx vs18, o16, BO // load real part from B - lxvdsx vs19, o24, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B + lxvd2x vs18, o32, BO // load real part from B + lxvd2x vs19, o48, BO // load imag part from B - addi BO, BO, 32 + addi BO, BO, 64 xvmaddadp vs32, vs8, vs20 // real*real, imag*real xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag @@ -1088,12 +1035,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B - lxvdsx vs18, o16, BO // load real part from B - lxvdsx vs19, o24, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B + lxvd2x vs18, o32, BO // load real part from B + lxvd2x vs19, o48, BO // load imag part from B - addi BO, BO, 32 + addi BO, BO, 64 xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs33, vs0, vs17 // real*imag, imag*imag @@ -1125,12 +1072,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B - lxvdsx vs18, o16, BO // load real part from B - lxvdsx vs19, o24, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B + lxvd2x vs18, o32, BO // load real part from B + lxvd2x vs19, o48, BO // load imag part from B - addi BO, BO, 32 + addi BO, BO, 64 xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag @@ -1410,12 +1357,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro LOAD2x2_1 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B - lxvdsx vs18, o16, BO // load real part from B - lxvdsx vs19, o24, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B + lxvd2x vs18, o32, BO // load real part from B + lxvd2x vs19, o48, BO // load imag part from B - addi BO, BO, 32 + addi BO, BO, 64 lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -1432,12 +1379,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 32 - lxvdsx vs20, o0, BO // load real part from B - lxvdsx vs21, o8, BO // load imag part from B - lxvdsx vs22, o16, BO // load real part from B - lxvdsx vs23, o24, BO // load imag part from B + lxvd2x vs20, o0, BO // load real part from B + lxvd2x vs21, o16, BO // load imag part from B + lxvd2x vs22, o32, BO // load real part from B + lxvd2x vs23, o48, BO // load imag part from B - addi BO, BO, 32 + addi BO, BO, 64 xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs33, vs0, vs17 // real*imag, imag*imag @@ -1459,12 +1406,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 32 - lxvdsx vs20, o0, BO // load real part from B - lxvdsx vs21, o8, BO // load imag part from B - lxvdsx vs22, o16, BO // load real part from B - lxvdsx vs23, o24, BO // load imag part from B + lxvd2x vs20, o0, BO // load real part from B + lxvd2x vs21, o16, BO // load imag part from B + lxvd2x vs22, o32, BO // load real part from B + lxvd2x vs23, o48, BO // load imag part from B - addi BO, BO, 32 + addi BO, BO, 64 xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag @@ -1486,12 +1433,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi AO, AO, 32 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B - lxvdsx vs18, o16, BO // load real part from B - lxvdsx vs19, o24, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B + lxvd2x vs18, o32, BO // load real part from B + lxvd2x vs19, o48, BO // load imag part from B - addi BO, BO, 32 + addi BO, BO, 64 xvmaddadp vs32, vs8, vs20 // real*real, imag*real xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag @@ -1529,12 +1476,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 32 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B - lxvdsx vs18, o16, BO // load real part from B - lxvdsx vs19, o24, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B + lxvd2x vs18, o32, BO // load real part from B + lxvd2x vs19, o48, BO // load imag part from B - addi BO, BO, 32 + addi BO, BO, 64 xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs33, vs0, vs17 // real*imag, imag*imag @@ -1556,12 +1503,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 32 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B - lxvdsx vs18, o16, BO // load real part from B - lxvdsx vs19, o24, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B + lxvd2x vs18, o32, BO // load real part from B + lxvd2x vs19, o48, BO // load imag part from B - addi BO, BO, 32 + addi BO, BO, 64 xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag @@ -1725,12 +1672,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro LOAD2x1_1 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B - lxvdsx vs18, o16, BO // load real part from B - lxvdsx vs19, o24, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B + lxvd2x vs18, o32, BO // load real part from B + lxvd2x vs19, o48, BO // load imag part from B - addi BO, BO, 32 + addi BO, BO, 64 lxvd2x vs0, o0, AO // load real,imag from A @@ -1745,12 +1692,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 16 - lxvdsx vs20, o0, BO // load real part from B - lxvdsx vs21, o8, BO // load imag part from B - lxvdsx vs22, o16, BO // load real part from B - lxvdsx vs23, o24, BO // load imag part from B + lxvd2x vs20, o0, BO // load real part from B + lxvd2x vs21, o16, BO // load imag part from B + lxvd2x vs22, o32, BO // load real part from B + lxvd2x vs23, o48, BO // load imag part from B - addi BO, BO, 32 + addi BO, BO, 64 xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs33, vs0, vs17 // real*imag, imag*imag @@ -1767,12 +1714,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 16 - lxvdsx vs20, o0, BO // load real part from B - lxvdsx vs21, o8, BO // load imag part from B - lxvdsx vs22, o16, BO // load real part from B - lxvdsx vs23, o24, BO // load imag part from B + lxvd2x vs20, o0, BO // load real part from B + lxvd2x vs21, o16, BO // load imag part from B + lxvd2x vs22, o32, BO // load real part from B + lxvd2x vs23, o48, BO // load imag part from B - addi BO, BO, 32 + addi BO, BO, 64 xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag @@ -1789,12 +1736,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi AO, AO, 16 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B - lxvdsx vs18, o16, BO // load real part from B - lxvdsx vs19, o24, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B + lxvd2x vs18, o32, BO // load real part from B + lxvd2x vs19, o48, BO // load imag part from B - addi BO, BO, 32 + addi BO, BO, 64 xvmaddadp vs32, vs8, vs20 // real*real, imag*real xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag @@ -1823,12 +1770,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 16 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B - lxvdsx vs18, o16, BO // load real part from B - lxvdsx vs19, o24, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B + lxvd2x vs18, o32, BO // load real part from B + lxvd2x vs19, o48, BO // load imag part from B - addi BO, BO, 32 + addi BO, BO, 64 xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs33, vs0, vs17 // real*imag, imag*imag @@ -1845,12 +1792,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 16 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B - lxvdsx vs18, o16, BO // load real part from B - lxvdsx vs19, o24, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B + lxvd2x vs18, o32, BO // load real part from B + lxvd2x vs19, o48, BO // load imag part from B - addi BO, BO, 32 + addi BO, BO, 64 xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag @@ -1956,10 +1903,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro LOAD1x8_1 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B - addi BO, BO, 16 + addi BO, BO, 32 lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -1994,10 +1941,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 - lxvdsx vs20, o0, BO // load real part from B - lxvdsx vs21, o8, BO // load imag part from B + lxvd2x vs20, o0, BO // load real part from B + lxvd2x vs21, o16, BO // load imag part from B - addi BO, BO, 16 + addi BO, BO, 32 xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs33, vs0, vs17 // real*imag, imag*imag @@ -2035,10 +1982,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 - lxvdsx vs20, o0, BO // load real part from B - lxvdsx vs21, o8, BO // load imag part from B + lxvd2x vs20, o0, BO // load real part from B + lxvd2x vs21, o16, BO // load imag part from B - addi BO, BO, 16 + addi BO, BO, 32 xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag @@ -2076,10 +2023,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B - addi BO, BO, 16 + addi BO, BO, 32 xvmaddadp vs32, vs8, vs20 // real*real, imag*real xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag @@ -2140,10 +2087,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi AO, AO, 64 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B - addi BO, BO, 16 + addi BO, BO, 32 xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs33, vs0, vs17 // real*imag, imag*imag @@ -2181,10 +2128,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B - addi BO, BO, 16 + addi BO, BO, 32 xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag @@ -2452,10 +2399,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro LOAD1x4_1 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B - addi BO, BO, 16 + addi BO, BO, 32 lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -2476,10 +2423,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 - lxvdsx vs20, o0, BO // load real part from B - lxvdsx vs21, o8, BO // load imag part from B + lxvd2x vs20, o0, BO // load real part from B + lxvd2x vs21, o16, BO // load imag part from B - addi BO, BO, 16 + addi BO, BO, 32 xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs33, vs0, vs17 // real*imag, imag*imag @@ -2502,10 +2449,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi AO, AO, 64 - lxvdsx vs20, o0, BO // load real part from B - lxvdsx vs21, o8, BO // load imag part from B + lxvd2x vs20, o0, BO // load real part from B + lxvd2x vs21, o16, BO // load imag part from B - addi BO, BO, 16 + addi BO, BO, 32 xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag @@ -2528,10 +2475,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B - addi BO, BO, 16 + addi BO, BO, 32 xvmaddadp vs32, vs8, vs20 // real*real, imag*real xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag @@ -2569,10 +2516,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B - addi BO, BO, 16 + addi BO, BO, 32 xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs33, vs0, vs17 // real*imag, imag*imag @@ -2595,10 +2542,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B - addi BO, BO, 16 + addi BO, BO, 32 xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag @@ -2748,10 +2695,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro LOAD1x2_1 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B - addi BO, BO, 16 + addi BO, BO, 32 lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -2768,10 +2715,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 32 - lxvdsx vs20, o0, BO // load real part from B - lxvdsx vs21, o8, BO // load imag part from B + lxvd2x vs20, o0, BO // load real part from B + lxvd2x vs21, o16, BO // load imag part from B - addi BO, BO, 16 + addi BO, BO, 32 xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs33, vs0, vs17 // real*imag, imag*imag @@ -2788,10 +2735,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 32 - lxvdsx vs20, o0, BO // load real part from B - lxvdsx vs21, o8, BO // load imag part from B + lxvd2x vs20, o0, BO // load real part from B + lxvd2x vs21, o16, BO // load imag part from B - addi BO, BO, 16 + addi BO, BO, 32 xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag @@ -2808,10 +2755,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 32 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B - addi BO, BO, 16 + addi BO, BO, 32 xvmaddadp vs32, vs8, vs20 // real*real, imag*real xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag @@ -2839,10 +2786,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi AO, AO, 32 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B - addi BO, BO, 16 + addi BO, BO, 32 xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs33, vs0, vs17 // real*imag, imag*imag @@ -2859,10 +2806,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 32 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B - addi BO, BO, 16 + addi BO, BO, 32 xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag @@ -2954,10 +2901,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro LOAD1x1_1 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B - addi BO, BO, 16 + addi BO, BO, 32 lxvd2x vs0, o0, AO // load real,imag from A @@ -2972,10 +2919,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 16 - lxvdsx vs20, o0, BO // load real part from B - lxvdsx vs21, o8, BO // load imag part from B + lxvd2x vs20, o0, BO // load real part from B + lxvd2x vs21, o16, BO // load imag part from B - addi BO, BO, 16 + addi BO, BO, 32 xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs33, vs0, vs17 // real*imag, imag*imag @@ -2989,10 +2936,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi AO, AO, 16 - lxvdsx vs20, o0, BO // load real part from B - lxvdsx vs21, o8, BO // load imag part from B + lxvd2x vs20, o0, BO // load real part from B + lxvd2x vs21, o16, BO // load imag part from B - addi BO, BO, 16 + addi BO, BO, 32 xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag @@ -3006,10 +2953,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 16 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B - addi BO, BO, 16 + addi BO, BO, 32 xvmaddadp vs32, vs8, vs20 // real*real, imag*real xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag @@ -3032,10 +2979,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 16 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B - addi BO, BO, 16 + addi BO, BO, 32 xvmuldp vs32, vs0, vs16 // real*real, imag*real xvmuldp vs33, vs0, vs17 // real*imag, imag*imag @@ -3049,10 +2996,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 16 - lxvdsx vs16, o0, BO // load real part from B - lxvdsx vs17, o8, BO // load imag part from B + lxvd2x vs16, o0, BO // load real part from B + lxvd2x vs17, o16, BO // load imag part from B - addi BO, BO, 16 + addi BO, BO, 32 xvmaddadp vs32, vs0, vs16 // real*real, imag*real xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag diff --git a/kernel/power/ztrmm_kernel_8x2_power8.S b/kernel/power/ztrmm_kernel_8x2_power8.S index 8b953765e..0cfe613d5 100644 --- a/kernel/power/ztrmm_kernel_8x2_power8.S +++ b/kernel/power/ztrmm_kernel_8x2_power8.S @@ -271,7 +271,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif #endif -#include "zgemm_macros_8x2_power8.S" +#include "ztrmm_macros_8x2_power8.S" cmpwi cr0, M, 0 ble .L999 diff --git a/kernel/power/ztrmm_macros_8x2_power8.S b/kernel/power/ztrmm_macros_8x2_power8.S new file mode 100644 index 000000000..701ec65c8 --- /dev/null +++ b/kernel/power/ztrmm_macros_8x2_power8.S @@ -0,0 +1,3110 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2016/03/05 Werner Saar (wernsaar@googlemail.com) +* BLASTEST : OK +* CTEST : OK +* TEST : OK +* LAPACK-TEST : OK +**************************************************************************************/ + + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + + #define XSFADD_R1 xsadddp + #define XSFADD_R2 xssubdp + #define XSFADD_I1 xsadddp + #define XSFADD_I2 xsadddp + +#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) + + #define XSFADD_R1 xsadddp + #define XSFADD_R2 xsadddp + #define XSFADD_I1 xssubdp + #define XSFADD_I2 xsadddp + +#elif defined(NC) || defined(TC) || defined(NR) || defined(TR) + + #define XSFADD_R1 xsadddp + #define XSFADD_R2 xsadddp + #define XSFADD_I1 xsadddp + #define XSFADD_I2 xssubdp + +#else // CC || CR || RC || RR + + #define XSFADD_R1 xsadddp + #define XSFADD_R2 xssubdp + #define XSFADD_I1 xssubdp + #define XSFADD_I2 xssubdp + +#endif + +/********************************************************************************************** +* Macros for N=2 and M=8 +**********************************************************************************************/ + +.macro LOAD2x8_1 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvd2x vs4, o0, AO // load real,imag from A + lxvd2x vs5, o16, AO // load real,imag from A + lxvd2x vs6, o32, AO // load real,imag from A + lxvd2x vs7, o48, AO // load real,imag from A + + addi AO, AO, 64 + + +.endm + +.macro KERNEL2x8_I1 + + lxvd2x vs8, o0, 
AO // load real,imag from A + lxvd2x vs9, o16, AO // load real,imag from A + lxvd2x vs10, o32, AO // load real,imag from A + lxvd2x vs11, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvd2x vs12, o0, AO // load real,imag from A + lxvd2x vs13, o16, AO // load real,imag from A + lxvd2x vs14, o32, AO // load real,imag from A + lxvd2x vs15, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + lxvdsx vs22, o16, BO // load real part from B + lxvdsx vs23, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + xvmuldp vs34, vs1, vs16 // real*real, imag*real + xvmuldp vs35, vs1, vs17 // real*imag, imag*imag + xvmuldp vs36, vs2, vs16 // real*real, imag*real + xvmuldp vs37, vs2, vs17 // real*imag, imag*imag + xvmuldp vs38, vs3, vs16 // real*real, imag*real + xvmuldp vs39, vs3, vs17 // real*imag, imag*imag + xvmuldp vs40, vs4, vs16 // real*real, imag*real + xvmuldp vs41, vs4, vs17 // real*imag, imag*imag + xvmuldp vs42, vs5, vs16 // real*real, imag*real + xvmuldp vs43, vs5, vs17 // real*imag, imag*imag + xvmuldp vs44, vs6, vs16 // real*real, imag*real + xvmuldp vs45, vs6, vs17 // real*imag, imag*imag + xvmuldp vs46, vs7, vs16 // real*real, imag*real + xvmuldp vs47, vs7, vs17 // real*imag, imag*imag + + xvmuldp vs48, vs0, vs18 // real*real, imag*real + xvmuldp vs49, vs0, vs19 // real*imag, imag*imag + xvmuldp vs50, vs1, vs18 // real*real, imag*real + xvmuldp vs51, vs1, vs19 // real*imag, imag*imag + xvmuldp vs52, vs2, vs18 // real*real, imag*real + xvmuldp vs53, vs2, vs19 // real*imag, imag*imag + xvmuldp vs54, vs3, vs18 // real*real, imag*real + xvmuldp vs55, vs3, vs19 // real*imag, imag*imag + xvmuldp vs56, vs4, vs18 // real*real, imag*real + xvmuldp vs57, vs4, vs19 // real*imag, imag*imag + xvmuldp vs58, vs5, vs18 // real*real, imag*real + xvmuldp vs59, vs5, vs19 // 
real*imag, imag*imag + xvmuldp vs60, vs6, vs18 // real*real, imag*real + xvmuldp vs61, vs6, vs19 // real*imag, imag*imag + xvmuldp vs62, vs7, vs18 // real*real, imag*real + xvmuldp vs63, vs7, vs19 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x8_1 + + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + + lxvdsx vs22, o16, BO // load real part from B + lxvdsx vs23, o24, BO // load imag part from B + + xvmaddadp vs36, vs2, vs16 // real*real, imag*real + xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag + xvmaddadp vs38, vs3, vs16 // real*real, imag*real + xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag + + lxvd2x vs8, o0, AO // load real,imag from A + lxvd2x vs9, o16, AO // load real,imag from A + + xvmaddadp vs40, vs4, vs16 // real*real, imag*real + xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag + xvmaddadp vs42, vs5, vs16 // real*real, imag*real + xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag + + lxvd2x vs10, o32, AO // load real,imag from A + lxvd2x vs11, o48, AO // load real,imag from A + + xvmaddadp vs44, vs6, vs16 // real*real, imag*real + xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag + + addi AO, AO, 64 + + xvmaddadp vs46, vs7, vs16 // real*real, imag*real + xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag + + xvmaddadp vs48, vs0, vs18 // real*real, imag*real + xvmaddadp vs49, vs0, vs19 // real*imag, imag*imag + xvmaddadp vs50, vs1, vs18 // real*real, imag*real + xvmaddadp vs51, vs1, vs19 // real*imag, imag*imag + + lxvd2x vs12, o0, AO // load real,imag from A + lxvd2x vs13, o16, AO // load real,imag from A + + xvmaddadp vs52, vs2, vs18 // real*real, imag*real + xvmaddadp vs53, vs2, vs19 // real*imag, imag*imag + xvmaddadp vs54, vs3, vs18 // real*real, imag*real + xvmaddadp vs55, vs3, vs19 // real*imag, imag*imag + + lxvd2x vs14, o32, AO // load real,imag from A + lxvd2x vs15, o48, AO 
// load real,imag from A + + xvmaddadp vs56, vs4, vs18 // real*real, imag*real + xvmaddadp vs57, vs4, vs19 // real*imag, imag*imag + xvmaddadp vs58, vs5, vs18 // real*real, imag*real + xvmaddadp vs59, vs5, vs19 // real*imag, imag*imag + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + + xvmaddadp vs60, vs6, vs18 // real*real, imag*real + xvmaddadp vs61, vs6, vs19 // real*imag, imag*imag + xvmaddadp vs62, vs7, vs18 // real*real, imag*real + xvmaddadp vs63, vs7, vs19 // real*imag, imag*imag + + addi AO, AO, 64 + addi BO, BO, 32 + +.endm + +.macro KERNEL2x8_2 + + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + xvmaddadp vs36, vs10, vs20 // real*real, imag*real + xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag + xvmaddadp vs38, vs11, vs20 // real*real, imag*real + xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + + xvmaddadp vs40, vs12, vs20 // real*real, imag*real + xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag + xvmaddadp vs42, vs13, vs20 // real*real, imag*real + xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag + + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + xvmaddadp vs44, vs14, vs20 // real*real, imag*real + xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag + xvmaddadp vs46, vs15, vs20 // real*real, imag*real + xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag + + addi AO, AO, 64 + + xvmaddadp vs48, vs8, vs22 // real*real, imag*real + xvmaddadp vs49, vs8, vs23 // real*imag, imag*imag + xvmaddadp vs50, vs9, vs22 // real*real, imag*real + xvmaddadp vs51, vs9, vs23 // real*imag, imag*imag + + lxvd2x vs4, 
o0, AO // load real,imag from A + lxvd2x vs5, o16, AO // load real,imag from A + + xvmaddadp vs52, vs10, vs22 // real*real, imag*real + xvmaddadp vs53, vs10, vs23 // real*imag, imag*imag + xvmaddadp vs54, vs11, vs22 // real*real, imag*real + xvmaddadp vs55, vs11, vs23 // real*imag, imag*imag + + lxvd2x vs6, o32, AO // load real,imag from A + lxvd2x vs7, o48, AO // load real,imag from A + + xvmaddadp vs56, vs12, vs22 // real*real, imag*real + xvmaddadp vs57, vs12, vs23 // real*imag, imag*imag + xvmaddadp vs58, vs13, vs22 // real*real, imag*real + xvmaddadp vs59, vs13, vs23 // real*imag, imag*imag + + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + xvmaddadp vs60, vs14, vs22 // real*real, imag*real + xvmaddadp vs61, vs14, vs23 // real*imag, imag*imag + xvmaddadp vs62, vs15, vs22 // real*real, imag*real + xvmaddadp vs63, vs15, vs23 // real*imag, imag*imag + + addi AO, AO, 64 + addi BO, BO, 32 + +.endm + +.macro KERNEL2x8_E2 + + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + xvmaddadp vs36, vs10, vs20 // real*real, imag*real + xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag + xvmaddadp vs38, vs11, vs20 // real*real, imag*real + xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag + xvmaddadp vs40, vs12, vs20 // real*real, imag*real + xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag + xvmaddadp vs42, vs13, vs20 // real*real, imag*real + xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag + xvmaddadp vs44, vs14, vs20 // real*real, imag*real + xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag + xvmaddadp vs46, vs15, vs20 // real*real, imag*real + xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag + + xvmaddadp vs48, vs8, vs22 // real*real, imag*real + xvmaddadp vs49, vs8, vs23 // real*imag, imag*imag + xvmaddadp vs50, vs9, vs22 // real*real, imag*real + 
xvmaddadp vs51, vs9, vs23 // real*imag, imag*imag + xvmaddadp vs52, vs10, vs22 // real*real, imag*real + xvmaddadp vs53, vs10, vs23 // real*imag, imag*imag + xvmaddadp vs54, vs11, vs22 // real*real, imag*real + xvmaddadp vs55, vs11, vs23 // real*imag, imag*imag + xvmaddadp vs56, vs12, vs22 // real*real, imag*real + xvmaddadp vs57, vs12, vs23 // real*imag, imag*imag + xvmaddadp vs58, vs13, vs22 // real*real, imag*real + xvmaddadp vs59, vs13, vs23 // real*imag, imag*imag + xvmaddadp vs60, vs14, vs22 // real*real, imag*real + xvmaddadp vs61, vs14, vs23 // real*imag, imag*imag + xvmaddadp vs62, vs15, vs22 // real*real, imag*real + xvmaddadp vs63, vs15, vs23 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x8_SUBI1 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvd2x vs4, o0, AO // load real,imag from A + lxvd2x vs5, o16, AO // load real,imag from A + lxvd2x vs6, o32, AO // load real,imag from A + lxvd2x vs7, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + xvmuldp vs34, vs1, vs16 // real*real, imag*real + xvmuldp vs35, vs1, vs17 // real*imag, imag*imag + xvmuldp vs36, vs2, vs16 // real*real, imag*real + xvmuldp vs37, vs2, vs17 // real*imag, imag*imag + xvmuldp vs38, vs3, vs16 // real*real, imag*real + xvmuldp vs39, vs3, vs17 // real*imag, imag*imag + xvmuldp vs40, vs4, vs16 // real*real, imag*real + xvmuldp vs41, vs4, vs17 // real*imag, imag*imag + xvmuldp vs42, vs5, vs16 // real*real, imag*real + xvmuldp vs43, vs5, vs17 // real*imag, imag*imag + xvmuldp vs44, vs6, vs16 // real*real, 
imag*real + xvmuldp vs45, vs6, vs17 // real*imag, imag*imag + xvmuldp vs46, vs7, vs16 // real*real, imag*real + xvmuldp vs47, vs7, vs17 // real*imag, imag*imag + + xvmuldp vs48, vs0, vs18 // real*real, imag*real + xvmuldp vs49, vs0, vs19 // real*imag, imag*imag + xvmuldp vs50, vs1, vs18 // real*real, imag*real + xvmuldp vs51, vs1, vs19 // real*imag, imag*imag + xvmuldp vs52, vs2, vs18 // real*real, imag*real + xvmuldp vs53, vs2, vs19 // real*imag, imag*imag + xvmuldp vs54, vs3, vs18 // real*real, imag*real + xvmuldp vs55, vs3, vs19 // real*imag, imag*imag + xvmuldp vs56, vs4, vs18 // real*real, imag*real + xvmuldp vs57, vs4, vs19 // real*imag, imag*imag + xvmuldp vs58, vs5, vs18 // real*real, imag*real + xvmuldp vs59, vs5, vs19 // real*imag, imag*imag + xvmuldp vs60, vs6, vs18 // real*real, imag*real + xvmuldp vs61, vs6, vs19 // real*imag, imag*imag + xvmuldp vs62, vs7, vs18 // real*real, imag*real + xvmuldp vs63, vs7, vs19 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x8_SUB1 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvd2x vs4, o0, AO // load real,imag from A + lxvd2x vs5, o16, AO // load real,imag from A + lxvd2x vs6, o32, AO // load real,imag from A + lxvd2x vs7, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + xvmaddadp vs36, vs2, vs16 // real*real, imag*real + xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag + xvmaddadp vs38, vs3, vs16 // real*real, imag*real + 
xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag + xvmaddadp vs40, vs4, vs16 // real*real, imag*real + xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag + xvmaddadp vs42, vs5, vs16 // real*real, imag*real + xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag + xvmaddadp vs44, vs6, vs16 // real*real, imag*real + xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag + xvmaddadp vs46, vs7, vs16 // real*real, imag*real + xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag + + xvmaddadp vs48, vs0, vs18 // real*real, imag*real + xvmaddadp vs49, vs0, vs19 // real*imag, imag*imag + xvmaddadp vs50, vs1, vs18 // real*real, imag*real + xvmaddadp vs51, vs1, vs19 // real*imag, imag*imag + xvmaddadp vs52, vs2, vs18 // real*real, imag*real + xvmaddadp vs53, vs2, vs19 // real*imag, imag*imag + xvmaddadp vs54, vs3, vs18 // real*real, imag*real + xvmaddadp vs55, vs3, vs19 // real*imag, imag*imag + xvmaddadp vs56, vs4, vs18 // real*real, imag*real + xvmaddadp vs57, vs4, vs19 // real*imag, imag*imag + xvmaddadp vs58, vs5, vs18 // real*real, imag*real + xvmaddadp vs59, vs5, vs19 // real*imag, imag*imag + xvmaddadp vs60, vs6, vs18 // real*real, imag*real + xvmaddadp vs61, vs6, vs19 // real*imag, imag*imag + xvmaddadp vs62, vs7, vs18 // real*real, imag*real + xvmaddadp vs63, vs7, vs19 // real*imag, imag*imag + + +.endm + +.macro SAVE2x8 + + + mr T1, CO + addi T2, T1, 64 + +#ifndef TRMMKERNEL + + lxvd2x vs16, o0, T1 + lxvd2x vs17, o16, T1 + lxvd2x vs18, o32, T1 + lxvd2x vs19, o48, T1 + lxvd2x vs20, o0, T2 + lxvd2x vs21, o16, T2 + lxvd2x vs22, o32, T2 + lxvd2x vs23, o48, T2 + +#endif + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs32 // realA*realB + XSFADD_R2 vs0, vs0, vs33 // imagA*imagB + + xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs32 // realA*imagB + 
XSFADD_I2 vs1, vs1, vs33 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs8, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs34 // realA*realB + XSFADD_R2 vs0, vs0, vs35 // imagA*imagB + + xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs34 // realA*imagB + XSFADD_I2 vs1, vs1, vs35 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs9, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs37, vs37 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs36 // realA*realB + XSFADD_R2 vs0, vs0, vs37 // imagA*imagB + + xxswapd vs36, vs36 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs37, vs37 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs36 // realA*imagB + XSFADD_I2 vs1, vs1, vs37 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs10, vs2, vs3, 0 // merge real and imag part + + + 
+ xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs38 // realA*realB + XSFADD_R2 vs0, vs0, vs39 // imagA*imagB + + xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs38 // realA*imagB + XSFADD_I2 vs1, vs1, vs39 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs11, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs41, vs41 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs40 // realA*realB + XSFADD_R2 vs0, vs0, vs41 // imagA*imagB + + xxswapd vs40, vs40 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs41, vs41 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs40 // realA*imagB + XSFADD_I2 vs1, vs1, vs41 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs12, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs43, vs43 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs42 // realA*realB + XSFADD_R2 vs0, vs0, vs43 // imagA*imagB + + xxswapd vs42, vs42 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs43, vs43 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, 
vs42 // realA*imagB + XSFADD_I2 vs1, vs1, vs43 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs13, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs45, vs45 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs44 // realA*realB + XSFADD_R2 vs0, vs0, vs45 // imagA*imagB + + xxswapd vs44, vs44 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs45, vs45 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs44 // realA*imagB + XSFADD_I2 vs1, vs1, vs45 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs14, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs47, vs47 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs46 // realA*realB + XSFADD_R2 vs0, vs0, vs47 // imagA*imagB + + xxswapd vs46, vs46 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs47, vs47 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs46 // realA*imagB + XSFADD_I2 vs1, vs1, vs47 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs15, vs2, vs3, 0 // merge 
real and imag part + + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + xvadddp vs9, vs9, vs17 + xvadddp vs10, vs10, vs18 + xvadddp vs11, vs11, vs19 + xvadddp vs12, vs12, vs20 + xvadddp vs13, vs13, vs21 + xvadddp vs14, vs14, vs22 + xvadddp vs15, vs15, vs23 + +#endif + + stxvd2x vs8, o0, T1 + stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 + stxvd2x vs12, o0, T2 + stxvd2x vs13, o16, T2 + stxvd2x vs14, o32, T2 + stxvd2x vs15, o48, T2 + + add T1, T1, LDC + add T2, T2, LDC + +#ifndef TRMMKERNEL + + lxvd2x vs16, o0, T1 + lxvd2x vs17, o16, T1 + lxvd2x vs18, o32, T1 + lxvd2x vs19, o48, T1 + lxvd2x vs20, o0, T2 + lxvd2x vs21, o16, T2 + lxvd2x vs22, o32, T2 + lxvd2x vs23, o48, T2 + +#endif + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs49, vs49 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs48 // realA*realB + XSFADD_R2 vs0, vs0, vs49 // imagA*imagB + + xxswapd vs48, vs48 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs49, vs49 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs48 // realA*imagB + XSFADD_I2 vs1, vs1, vs49 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs8, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs51, vs51 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs50 // realA*realB + XSFADD_R2 vs0, vs0, vs51 // imagA*imagB + + xxswapd vs50, vs50 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs51, vs51 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs50 // realA*imagB + XSFADD_I2 vs1, vs1, vs51 // imagA*realB + + xsmuldp vs4, vs0, alpha_r 
// real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs9, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs53, vs53 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs52 // realA*realB + XSFADD_R2 vs0, vs0, vs53 // imagA*imagB + + xxswapd vs52, vs52 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs53, vs53 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs52 // realA*imagB + XSFADD_I2 vs1, vs1, vs53 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs10, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs55, vs55 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs54 // realA*realB + XSFADD_R2 vs0, vs0, vs55 // imagA*imagB + + xxswapd vs54, vs54 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs55, vs55 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs54 // realA*imagB + XSFADD_I2 vs1, vs1, vs55 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs11, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs57, vs57 // 
realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs56 // realA*realB + XSFADD_R2 vs0, vs0, vs57 // imagA*imagB + + xxswapd vs56, vs56 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs57, vs57 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs56 // realA*imagB + XSFADD_I2 vs1, vs1, vs57 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs12, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs59, vs59 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs58 // realA*realB + XSFADD_R2 vs0, vs0, vs59 // imagA*imagB + + xxswapd vs58, vs58 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs59, vs59 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs58 // realA*imagB + XSFADD_I2 vs1, vs1, vs59 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs13, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs61, vs61 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs60 // realA*realB + XSFADD_R2 vs0, vs0, vs61 // imagA*imagB + + xxswapd vs60, vs60 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs61, vs61 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs60 // realA*imagB + XSFADD_I2 vs1, vs1, vs61 // imagA*realB + + 
xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs14, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs63, vs63 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs62 // realA*realB + XSFADD_R2 vs0, vs0, vs63 // imagA*imagB + + xxswapd vs62, vs62 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs63, vs63 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs62 // realA*imagB + XSFADD_I2 vs1, vs1, vs63 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs15, vs2, vs3, 0 // merge real and imag part + + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + xvadddp vs9, vs9, vs17 + xvadddp vs10, vs10, vs18 + xvadddp vs11, vs11, vs19 + xvadddp vs12, vs12, vs20 + xvadddp vs13, vs13, vs21 + xvadddp vs14, vs14, vs22 + xvadddp vs15, vs15, vs23 + +#endif + + stxvd2x vs8, o0, T1 + stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 + stxvd2x vs12, o0, T2 + stxvd2x vs13, o16, T2 + stxvd2x vs14, o32, T2 + stxvd2x vs15, o48, T2 + + add T1, T1, LDC + add T2, T2, LDC + addi CO, CO, 128 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=4 +**********************************************************************************************/ + +.macro LOAD2x4_1 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx 
vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + +.endm + +.macro KERNEL2x4_I1 + + lxvd2x vs8, o0, AO // load real,imag from A + lxvd2x vs9, o16, AO // load real,imag from A + lxvd2x vs10, o32, AO // load real,imag from A + lxvd2x vs11, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + lxvdsx vs22, o16, BO // load real part from B + lxvdsx vs23, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + xvmuldp vs34, vs1, vs16 // real*real, imag*real + xvmuldp vs35, vs1, vs17 // real*imag, imag*imag + xvmuldp vs36, vs2, vs16 // real*real, imag*real + xvmuldp vs37, vs2, vs17 // real*imag, imag*imag + xvmuldp vs38, vs3, vs16 // real*real, imag*real + xvmuldp vs39, vs3, vs17 // real*imag, imag*imag + + xvmuldp vs40, vs0, vs18 // real*real, imag*real + xvmuldp vs41, vs0, vs19 // real*imag, imag*imag + xvmuldp vs42, vs1, vs18 // real*real, imag*real + xvmuldp vs43, vs1, vs19 // real*imag, imag*imag + xvmuldp vs44, vs2, vs18 // real*real, imag*real + xvmuldp vs45, vs2, vs19 // real*imag, imag*imag + xvmuldp vs46, vs3, vs18 // real*real, imag*real + xvmuldp vs47, vs3, vs19 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x4_1 + + lxvd2x vs8, o0, AO // load real,imag from A + lxvd2x vs9, o16, AO // load real,imag from A + lxvd2x vs10, o32, AO // load real,imag from A + lxvd2x vs11, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + lxvdsx vs22, o16, BO // load real part from B + lxvdsx vs23, o24, BO // load 
imag part from B + + addi BO, BO, 32 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + xvmaddadp vs36, vs2, vs16 // real*real, imag*real + xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag + xvmaddadp vs38, vs3, vs16 // real*real, imag*real + xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag + + xvmaddadp vs40, vs0, vs18 // real*real, imag*real + xvmaddadp vs41, vs0, vs19 // real*imag, imag*imag + xvmaddadp vs42, vs1, vs18 // real*real, imag*real + xvmaddadp vs43, vs1, vs19 // real*imag, imag*imag + xvmaddadp vs44, vs2, vs18 // real*real, imag*real + xvmaddadp vs45, vs2, vs19 // real*imag, imag*imag + xvmaddadp vs46, vs3, vs18 // real*real, imag*real + xvmaddadp vs47, vs3, vs19 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x4_2 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + xvmaddadp vs36, vs10, vs20 // real*real, imag*real + xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag + xvmaddadp vs38, vs11, vs20 // real*real, imag*real + xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag + + xvmaddadp vs40, vs8, vs22 // real*real, imag*real + xvmaddadp vs41, vs8, vs23 // real*imag, imag*imag + xvmaddadp vs42, vs9, vs22 // real*real, imag*real + xvmaddadp vs43, vs9, vs23 // real*imag, imag*imag + xvmaddadp vs44, vs10, vs22 // 
real*real, imag*real + xvmaddadp vs45, vs10, vs23 // real*imag, imag*imag + xvmaddadp vs46, vs11, vs22 // real*real, imag*real + xvmaddadp vs47, vs11, vs23 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x4_E2 + + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + xvmaddadp vs36, vs10, vs20 // real*real, imag*real + xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag + xvmaddadp vs38, vs11, vs20 // real*real, imag*real + xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag + + xvmaddadp vs40, vs8, vs22 // real*real, imag*real + xvmaddadp vs41, vs8, vs23 // real*imag, imag*imag + xvmaddadp vs42, vs9, vs22 // real*real, imag*real + xvmaddadp vs43, vs9, vs23 // real*imag, imag*imag + xvmaddadp vs44, vs10, vs22 // real*real, imag*real + xvmaddadp vs45, vs10, vs23 // real*imag, imag*imag + xvmaddadp vs46, vs11, vs22 // real*real, imag*real + xvmaddadp vs47, vs11, vs23 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x4_SUBI1 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + xvmuldp vs34, vs1, vs16 // real*real, imag*real + xvmuldp vs35, vs1, vs17 // real*imag, imag*imag + xvmuldp vs36, vs2, vs16 // real*real, imag*real + xvmuldp vs37, vs2, vs17 // real*imag, imag*imag + xvmuldp vs38, vs3, vs16 // real*real, imag*real + xvmuldp vs39, vs3, vs17 // real*imag, imag*imag + + xvmuldp vs40, vs0, vs18 // real*real, imag*real + xvmuldp vs41, vs0, vs19 
// real*imag, imag*imag + xvmuldp vs42, vs1, vs18 // real*real, imag*real + xvmuldp vs43, vs1, vs19 // real*imag, imag*imag + xvmuldp vs44, vs2, vs18 // real*real, imag*real + xvmuldp vs45, vs2, vs19 // real*imag, imag*imag + xvmuldp vs46, vs3, vs18 // real*real, imag*real + xvmuldp vs47, vs3, vs19 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x4_SUB1 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + xvmaddadp vs36, vs2, vs16 // real*real, imag*real + xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag + xvmaddadp vs38, vs3, vs16 // real*real, imag*real + xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag + + xvmaddadp vs40, vs0, vs18 // real*real, imag*real + xvmaddadp vs41, vs0, vs19 // real*imag, imag*imag + xvmaddadp vs42, vs1, vs18 // real*real, imag*real + xvmaddadp vs43, vs1, vs19 // real*imag, imag*imag + xvmaddadp vs44, vs2, vs18 // real*real, imag*real + xvmaddadp vs45, vs2, vs19 // real*imag, imag*imag + xvmaddadp vs46, vs3, vs18 // real*real, imag*real + xvmaddadp vs47, vs3, vs19 // real*imag, imag*imag + + +.endm + +.macro SAVE2x4 + + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvd2x vs16, o0, T1 + lxvd2x vs17, o16, T1 + lxvd2x vs18, o32, T1 + lxvd2x vs19, o48, T1 + +#endif + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs32 // realA*realB + XSFADD_R2 vs0, vs0, vs33 // imagA*imagB 
+ + xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs32 // realA*imagB + XSFADD_I2 vs1, vs1, vs33 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs8, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs34 // realA*realB + XSFADD_R2 vs0, vs0, vs35 // imagA*imagB + + xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs34 // realA*imagB + XSFADD_I2 vs1, vs1, vs35 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs9, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs37, vs37 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs36 // realA*realB + XSFADD_R2 vs0, vs0, vs37 // imagA*imagB + + xxswapd vs36, vs36 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs37, vs37 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs36 // realA*imagB + XSFADD_I2 vs1, vs1, vs37 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, 
vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs10, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs38 // realA*realB + XSFADD_R2 vs0, vs0, vs39 // imagA*imagB + + xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs38 // realA*imagB + XSFADD_I2 vs1, vs1, vs39 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs11, vs2, vs3, 0 // merge real and imag part + + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + xvadddp vs9, vs9, vs17 + xvadddp vs10, vs10, vs18 + xvadddp vs11, vs11, vs19 + +#endif + + stxvd2x vs8, o0, T1 + stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + + lxvd2x vs16, o0, T1 + lxvd2x vs17, o16, T1 + lxvd2x vs18, o32, T1 + lxvd2x vs19, o48, T1 + +#endif + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs41, vs41 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs40 // realA*realB + XSFADD_R2 vs0, vs0, vs41 // imagA*imagB + + xxswapd vs40, vs40 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs41, vs41 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs40 // realA*imagB + XSFADD_I2 vs1, vs1, vs41 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r 
// imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs8, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs43, vs43 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs42 // realA*realB + XSFADD_R2 vs0, vs0, vs43 // imagA*imagB + + xxswapd vs42, vs42 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs43, vs43 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs42 // realA*imagB + XSFADD_I2 vs1, vs1, vs43 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs9, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs45, vs45 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs44 // realA*realB + XSFADD_R2 vs0, vs0, vs45 // imagA*imagB + + xxswapd vs44, vs44 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs45, vs45 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs44 // realA*imagB + XSFADD_I2 vs1, vs1, vs45 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs10, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs47, vs47 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs46 // realA*realB + XSFADD_R2 vs0, vs0, vs47 // 
imagA*imagB + + xxswapd vs46, vs46 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs47, vs47 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs46 // realA*imagB + XSFADD_I2 vs1, vs1, vs47 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs11, vs2, vs3, 0 // merge real and imag part + + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + xvadddp vs9, vs9, vs17 + xvadddp vs10, vs10, vs18 + xvadddp vs11, vs11, vs19 + +#endif + + stxvd2x vs8, o0, T1 + stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 + + add T1, T1, LDC + addi CO, CO, 64 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=2 +**********************************************************************************************/ + +.macro LOAD2x2_1 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + + addi AO, AO, 32 + + +.endm + +.macro KERNEL2x2_I1 + + lxvd2x vs8, o0, AO // load real,imag from A + lxvd2x vs9, o16, AO // load real,imag from A + + addi AO, AO, 32 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + lxvdsx vs22, o16, BO // load real part from B + lxvdsx vs23, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + xvmuldp vs34, vs1, vs16 // real*real, imag*real + xvmuldp vs35, vs1, vs17 // 
real*imag, imag*imag + + xvmuldp vs36, vs0, vs18 // real*real, imag*real + xvmuldp vs37, vs0, vs19 // real*imag, imag*imag + xvmuldp vs38, vs1, vs18 // real*real, imag*real + xvmuldp vs39, vs1, vs19 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x2_1 + + lxvd2x vs8, o0, AO // load real,imag from A + lxvd2x vs9, o16, AO // load real,imag from A + + addi AO, AO, 32 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + lxvdsx vs22, o16, BO // load real part from B + lxvdsx vs23, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + + xvmaddadp vs36, vs0, vs18 // real*real, imag*real + xvmaddadp vs37, vs0, vs19 // real*imag, imag*imag + xvmaddadp vs38, vs1, vs18 // real*real, imag*real + xvmaddadp vs39, vs1, vs19 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x2_2 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + + addi AO, AO, 32 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + + xvmaddadp vs36, vs8, vs22 // real*real, imag*real + xvmaddadp vs37, vs8, vs23 // real*imag, imag*imag + xvmaddadp vs38, vs9, vs22 // real*real, imag*real + xvmaddadp vs39, vs9, vs23 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x2_E2 + + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, 
vs21 // real*imag, imag*imag + + xvmaddadp vs36, vs8, vs22 // real*real, imag*real + xvmaddadp vs37, vs8, vs23 // real*imag, imag*imag + xvmaddadp vs38, vs9, vs22 // real*real, imag*real + xvmaddadp vs39, vs9, vs23 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x2_SUBI1 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + + addi AO, AO, 32 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + xvmuldp vs34, vs1, vs16 // real*real, imag*real + xvmuldp vs35, vs1, vs17 // real*imag, imag*imag + + xvmuldp vs36, vs0, vs18 // real*real, imag*real + xvmuldp vs37, vs0, vs19 // real*imag, imag*imag + xvmuldp vs38, vs1, vs18 // real*real, imag*real + xvmuldp vs39, vs1, vs19 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x2_SUB1 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + + addi AO, AO, 32 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + + xvmaddadp vs36, vs0, vs18 // real*real, imag*real + xvmaddadp vs37, vs0, vs19 // real*imag, imag*imag + xvmaddadp vs38, vs1, vs18 // real*real, imag*real + xvmaddadp vs39, vs1, vs19 // real*imag, imag*imag + + +.endm + +.macro SAVE2x2 + + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvd2x vs16, o0, T1 + lxvd2x vs17, o16, T1 + +#endif + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs33, vs33 // 
realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs32 // realA*realB + XSFADD_R2 vs0, vs0, vs33 // imagA*imagB + + xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs32 // realA*imagB + XSFADD_I2 vs1, vs1, vs33 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs8, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs34 // realA*realB + XSFADD_R2 vs0, vs0, vs35 // imagA*imagB + + xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs34 // realA*imagB + XSFADD_I2 vs1, vs1, vs35 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs9, vs2, vs3, 0 // merge real and imag part + + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + xvadddp vs9, vs9, vs17 + +#endif + + stxvd2x vs8, o0, T1 + stxvd2x vs9, o16, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + + lxvd2x vs16, o0, T1 + lxvd2x vs17, o16, T1 + +#endif + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs37, vs37 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs36 // realA*realB + XSFADD_R2 vs0, vs0, vs37 // imagA*imagB + + xxswapd vs36, vs36 
// realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs37, vs37 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs36 // realA*imagB + XSFADD_I2 vs1, vs1, vs37 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs8, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs38 // realA*realB + XSFADD_R2 vs0, vs0, vs39 // imagA*imagB + + xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs38 // realA*imagB + XSFADD_I2 vs1, vs1, vs39 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs9, vs2, vs3, 0 // merge real and imag part + + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + xvadddp vs9, vs9, vs17 + +#endif + + stxvd2x vs8, o0, T1 + stxvd2x vs9, o16, T1 + + add T1, T1, LDC + addi CO, CO, 32 + +.endm + + +/********************************************************************************************** +* Macros for N=2 and M=1 +**********************************************************************************************/ + +.macro LOAD2x1_1 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, 
BO, 32 + + lxvd2x vs0, o0, AO // load real,imag from A + + addi AO, AO, 16 + + +.endm + +.macro KERNEL2x1_I1 + + lxvd2x vs8, o0, AO // load real,imag from A + + addi AO, AO, 16 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + lxvdsx vs22, o16, BO // load real part from B + lxvdsx vs23, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + + xvmuldp vs34, vs0, vs18 // real*real, imag*real + xvmuldp vs35, vs0, vs19 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x1_1 + + lxvd2x vs8, o0, AO // load real,imag from A + + addi AO, AO, 16 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + lxvdsx vs22, o16, BO // load real part from B + lxvdsx vs23, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + + xvmaddadp vs34, vs0, vs18 // real*real, imag*real + xvmaddadp vs35, vs0, vs19 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x1_2 + + lxvd2x vs0, o0, AO // load real,imag from A + + addi AO, AO, 16 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + + xvmaddadp vs34, vs8, vs22 // real*real, imag*real + xvmaddadp vs35, vs8, vs23 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x1_E2 + + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + + xvmaddadp vs34, vs8, vs22 // real*real, imag*real + xvmaddadp vs35, vs8, vs23 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x1_SUBI1 + + lxvd2x vs0, o0, AO // load real,imag from A + + addi AO, AO, 16 + + 
lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + + xvmuldp vs34, vs0, vs18 // real*real, imag*real + xvmuldp vs35, vs0, vs19 // real*imag, imag*imag + + +.endm + +.macro KERNEL2x1_SUB1 + + lxvd2x vs0, o0, AO // load real,imag from A + + addi AO, AO, 16 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + lxvdsx vs18, o16, BO // load real part from B + lxvdsx vs19, o24, BO // load imag part from B + + addi BO, BO, 32 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + + xvmaddadp vs34, vs0, vs18 // real*real, imag*real + xvmaddadp vs35, vs0, vs19 // real*imag, imag*imag + + +.endm + +.macro SAVE2x1 + + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvd2x vs16, o0, T1 + +#endif + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs32 // realA*realB + XSFADD_R2 vs0, vs0, vs33 // imagA*imagB + + xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs32 // realA*imagB + XSFADD_I2 vs1, vs1, vs33 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs8, vs2, vs3, 0 // merge real and imag part + + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + +#endif + + stxvd2x vs8, o0, T1 + + add T1, T1, LDC + +#ifndef TRMMKERNEL + + lxvd2x vs16, o0, T1 + 
+#endif + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs34 // realA*realB + XSFADD_R2 vs0, vs0, vs35 // imagA*imagB + + xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs34 // realA*imagB + XSFADD_I2 vs1, vs1, vs35 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs8, vs2, vs3, 0 // merge real and imag part + + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + +#endif + + stxvd2x vs8, o0, T1 + + add T1, T1, LDC + addi CO, CO, 16 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=8 +**********************************************************************************************/ + +.macro LOAD1x8_1 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvd2x vs4, o0, AO // load real,imag from A + lxvd2x vs5, o16, AO // load real,imag from A + lxvd2x vs6, o32, AO // load real,imag from A + lxvd2x vs7, o48, AO // load real,imag from A + + addi AO, AO, 64 + + +.endm + +.macro KERNEL1x8_I1 + + lxvd2x vs8, o0, AO // load real,imag from A + lxvd2x vs9, o16, AO // load real,imag from A + lxvd2x vs10, o32, AO // load real,imag from A + lxvd2x vs11, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvd2x vs12, o0, AO // load 
real,imag from A + lxvd2x vs13, o16, AO // load real,imag from A + lxvd2x vs14, o32, AO // load real,imag from A + lxvd2x vs15, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + xvmuldp vs34, vs1, vs16 // real*real, imag*real + xvmuldp vs35, vs1, vs17 // real*imag, imag*imag + xvmuldp vs36, vs2, vs16 // real*real, imag*real + xvmuldp vs37, vs2, vs17 // real*imag, imag*imag + xvmuldp vs38, vs3, vs16 // real*real, imag*real + xvmuldp vs39, vs3, vs17 // real*imag, imag*imag + xvmuldp vs40, vs4, vs16 // real*real, imag*real + xvmuldp vs41, vs4, vs17 // real*imag, imag*imag + xvmuldp vs42, vs5, vs16 // real*real, imag*real + xvmuldp vs43, vs5, vs17 // real*imag, imag*imag + xvmuldp vs44, vs6, vs16 // real*real, imag*real + xvmuldp vs45, vs6, vs17 // real*imag, imag*imag + xvmuldp vs46, vs7, vs16 // real*real, imag*real + xvmuldp vs47, vs7, vs17 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x8_1 + + lxvd2x vs8, o0, AO // load real,imag from A + lxvd2x vs9, o16, AO // load real,imag from A + lxvd2x vs10, o32, AO // load real,imag from A + lxvd2x vs11, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvd2x vs12, o0, AO // load real,imag from A + lxvd2x vs13, o16, AO // load real,imag from A + lxvd2x vs14, o32, AO // load real,imag from A + lxvd2x vs15, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + xvmaddadp vs36, vs2, vs16 // real*real, imag*real + xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag + xvmaddadp 
vs38, vs3, vs16 // real*real, imag*real + xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag + xvmaddadp vs40, vs4, vs16 // real*real, imag*real + xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag + xvmaddadp vs42, vs5, vs16 // real*real, imag*real + xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag + xvmaddadp vs44, vs6, vs16 // real*real, imag*real + xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag + xvmaddadp vs46, vs7, vs16 // real*real, imag*real + xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x8_2 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvd2x vs4, o0, AO // load real,imag from A + lxvd2x vs5, o16, AO // load real,imag from A + lxvd2x vs6, o32, AO // load real,imag from A + lxvd2x vs7, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + xvmaddadp vs36, vs10, vs20 // real*real, imag*real + xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag + xvmaddadp vs38, vs11, vs20 // real*real, imag*real + xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag + xvmaddadp vs40, vs12, vs20 // real*real, imag*real + xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag + xvmaddadp vs42, vs13, vs20 // real*real, imag*real + xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag + xvmaddadp vs44, vs14, vs20 // real*real, imag*real + xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag + xvmaddadp vs46, vs15, vs20 // real*real, imag*real + xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x8_E2 + + + xvmaddadp vs32, vs8, vs20 // 
real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + xvmaddadp vs36, vs10, vs20 // real*real, imag*real + xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag + xvmaddadp vs38, vs11, vs20 // real*real, imag*real + xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag + xvmaddadp vs40, vs12, vs20 // real*real, imag*real + xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag + xvmaddadp vs42, vs13, vs20 // real*real, imag*real + xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag + xvmaddadp vs44, vs14, vs20 // real*real, imag*real + xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag + xvmaddadp vs46, vs15, vs20 // real*real, imag*real + xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x8_SUBI1 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvd2x vs4, o0, AO // load real,imag from A + lxvd2x vs5, o16, AO // load real,imag from A + lxvd2x vs6, o32, AO // load real,imag from A + lxvd2x vs7, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + xvmuldp vs34, vs1, vs16 // real*real, imag*real + xvmuldp vs35, vs1, vs17 // real*imag, imag*imag + xvmuldp vs36, vs2, vs16 // real*real, imag*real + xvmuldp vs37, vs2, vs17 // real*imag, imag*imag + xvmuldp vs38, vs3, vs16 // real*real, imag*real + xvmuldp vs39, vs3, vs17 // real*imag, imag*imag + xvmuldp vs40, vs4, vs16 // real*real, imag*real + xvmuldp vs41, vs4, vs17 // real*imag, imag*imag + xvmuldp vs42, vs5, vs16 // real*real, imag*real + xvmuldp vs43, vs5, vs17 // real*imag, imag*imag + xvmuldp 
vs44, vs6, vs16 // real*real, imag*real + xvmuldp vs45, vs6, vs17 // real*imag, imag*imag + xvmuldp vs46, vs7, vs16 // real*real, imag*real + xvmuldp vs47, vs7, vs17 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x8_SUB1 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvd2x vs4, o0, AO // load real,imag from A + lxvd2x vs5, o16, AO // load real,imag from A + lxvd2x vs6, o32, AO // load real,imag from A + lxvd2x vs7, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + xvmaddadp vs36, vs2, vs16 // real*real, imag*real + xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag + xvmaddadp vs38, vs3, vs16 // real*real, imag*real + xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag + xvmaddadp vs40, vs4, vs16 // real*real, imag*real + xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag + xvmaddadp vs42, vs5, vs16 // real*real, imag*real + xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag + xvmaddadp vs44, vs6, vs16 // real*real, imag*real + xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag + xvmaddadp vs46, vs7, vs16 // real*real, imag*real + xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag + + +.endm + +.macro SAVE1x8 + + + mr T1, CO + addi T2, T1, 64 + +#ifndef TRMMKERNEL + + lxvd2x vs16, o0, T1 + lxvd2x vs17, o16, T1 + lxvd2x vs18, o32, T1 + lxvd2x vs19, o48, T1 + lxvd2x vs20, o0, T2 + lxvd2x vs21, o16, T2 + lxvd2x vs22, o32, T2 + lxvd2x vs23, o48, T2 + +#endif + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB 
+ + XSFADD_R1 vs0, vs0, vs32 // realA*realB + XSFADD_R2 vs0, vs0, vs33 // imagA*imagB + + xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs32 // realA*imagB + XSFADD_I2 vs1, vs1, vs33 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs8, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs34 // realA*realB + XSFADD_R2 vs0, vs0, vs35 // imagA*imagB + + xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs34 // realA*imagB + XSFADD_I2 vs1, vs1, vs35 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs9, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs37, vs37 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs36 // realA*realB + XSFADD_R2 vs0, vs0, vs37 // imagA*imagB + + xxswapd vs36, vs36 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs37, vs37 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs36 // realA*imagB + XSFADD_I2 vs1, vs1, vs37 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, 
alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs10, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs38 // realA*realB + XSFADD_R2 vs0, vs0, vs39 // imagA*imagB + + xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs38 // realA*imagB + XSFADD_I2 vs1, vs1, vs39 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs11, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs41, vs41 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs40 // realA*realB + XSFADD_R2 vs0, vs0, vs41 // imagA*imagB + + xxswapd vs40, vs40 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs41, vs41 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs40 // realA*imagB + XSFADD_I2 vs1, vs1, vs41 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs12, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs43, vs43 // realA*imagB, imagA*imagB -> 
imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs42 // realA*realB + XSFADD_R2 vs0, vs0, vs43 // imagA*imagB + + xxswapd vs42, vs42 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs43, vs43 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs42 // realA*imagB + XSFADD_I2 vs1, vs1, vs43 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs13, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs45, vs45 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs44 // realA*realB + XSFADD_R2 vs0, vs0, vs45 // imagA*imagB + + xxswapd vs44, vs44 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs45, vs45 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs44 // realA*imagB + XSFADD_I2 vs1, vs1, vs45 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs14, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs47, vs47 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs46 // realA*realB + XSFADD_R2 vs0, vs0, vs47 // imagA*imagB + + xxswapd vs46, vs46 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs47, vs47 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs46 // realA*imagB + XSFADD_I2 vs1, vs1, vs47 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // 
real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs15, vs2, vs3, 0 // merge real and imag part + + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + xvadddp vs9, vs9, vs17 + xvadddp vs10, vs10, vs18 + xvadddp vs11, vs11, vs19 + xvadddp vs12, vs12, vs20 + xvadddp vs13, vs13, vs21 + xvadddp vs14, vs14, vs22 + xvadddp vs15, vs15, vs23 + +#endif + + stxvd2x vs8, o0, T1 + stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 + stxvd2x vs12, o0, T2 + stxvd2x vs13, o16, T2 + stxvd2x vs14, o32, T2 + stxvd2x vs15, o48, T2 + + add T1, T1, LDC + add T2, T2, LDC + addi CO, CO, 128 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=4 +**********************************************************************************************/ + +.macro LOAD1x4_1 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + +.endm + +.macro KERNEL1x4_I1 + + lxvd2x vs8, o0, AO // load real,imag from A + lxvd2x vs9, o16, AO // load real,imag from A + lxvd2x vs10, o32, AO // load real,imag from A + lxvd2x vs11, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + xvmuldp vs34, vs1, vs16 // real*real, imag*real + xvmuldp vs35, vs1, vs17 // real*imag, imag*imag + xvmuldp vs36, vs2, vs16 // real*real, imag*real 
+ xvmuldp vs37, vs2, vs17 // real*imag, imag*imag + xvmuldp vs38, vs3, vs16 // real*real, imag*real + xvmuldp vs39, vs3, vs17 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x4_1 + + lxvd2x vs8, o0, AO // load real,imag from A + lxvd2x vs9, o16, AO // load real,imag from A + lxvd2x vs10, o32, AO // load real,imag from A + lxvd2x vs11, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + xvmaddadp vs36, vs2, vs16 // real*real, imag*real + xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag + xvmaddadp vs38, vs3, vs16 // real*real, imag*real + xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x4_2 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + xvmaddadp vs36, vs10, vs20 // real*real, imag*real + xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag + xvmaddadp vs38, vs11, vs20 // real*real, imag*real + xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x4_E2 + + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + xvmaddadp vs36, vs10, vs20 // real*real, 
imag*real + xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag + xvmaddadp vs38, vs11, vs20 // real*real, imag*real + xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x4_SUBI1 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + xvmuldp vs34, vs1, vs16 // real*real, imag*real + xvmuldp vs35, vs1, vs17 // real*imag, imag*imag + xvmuldp vs36, vs2, vs16 // real*real, imag*real + xvmuldp vs37, vs2, vs17 // real*imag, imag*imag + xvmuldp vs38, vs3, vs16 // real*real, imag*real + xvmuldp vs39, vs3, vs17 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x4_SUB1 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + lxvd2x vs2, o32, AO // load real,imag from A + lxvd2x vs3, o48, AO // load real,imag from A + + addi AO, AO, 64 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + xvmaddadp vs36, vs2, vs16 // real*real, imag*real + xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag + xvmaddadp vs38, vs3, vs16 // real*real, imag*real + xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag + + +.endm + +.macro SAVE1x4 + + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvd2x vs16, o0, T1 + lxvd2x vs17, o16, T1 + lxvd2x vs18, o32, T1 + lxvd2x vs19, o48, T1 + +#endif + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, 
realA*imagB + + XSFADD_R1 vs0, vs0, vs32 // realA*realB + XSFADD_R2 vs0, vs0, vs33 // imagA*imagB + + xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs32 // realA*imagB + XSFADD_I2 vs1, vs1, vs33 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs8, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs34 // realA*realB + XSFADD_R2 vs0, vs0, vs35 // imagA*imagB + + xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs34 // realA*imagB + XSFADD_I2 vs1, vs1, vs35 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs9, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs37, vs37 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs36 // realA*realB + XSFADD_R2 vs0, vs0, vs37 // imagA*imagB + + xxswapd vs36, vs36 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs37, vs37 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs36 // realA*imagB + XSFADD_I2 vs1, vs1, vs37 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + 
xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs10, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs38 // realA*realB + XSFADD_R2 vs0, vs0, vs39 // imagA*imagB + + xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs38 // realA*imagB + XSFADD_I2 vs1, vs1, vs39 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs11, vs2, vs3, 0 // merge real and imag part + + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + xvadddp vs9, vs9, vs17 + xvadddp vs10, vs10, vs18 + xvadddp vs11, vs11, vs19 + +#endif + + stxvd2x vs8, o0, T1 + stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 + + add T1, T1, LDC + addi CO, CO, 64 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=2 +**********************************************************************************************/ + +.macro LOAD1x2_1 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + + addi AO, AO, 32 + + +.endm + +.macro KERNEL1x2_I1 + + lxvd2x vs8, o0, AO // load real,imag from A + lxvd2x vs9, o16, AO // load real,imag from A + + addi 
AO, AO, 32 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + xvmuldp vs34, vs1, vs16 // real*real, imag*real + xvmuldp vs35, vs1, vs17 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x2_1 + + lxvd2x vs8, o0, AO // load real,imag from A + lxvd2x vs9, o16, AO // load real,imag from A + + addi AO, AO, 32 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x2_2 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + + addi AO, AO, 32 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x2_E2 + + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + xvmaddadp vs34, vs9, vs20 // real*real, imag*real + xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x2_SUBI1 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + + addi AO, AO, 32 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + xvmuldp vs34, vs1, vs16 // real*real, imag*real + xvmuldp vs35, vs1, vs17 // real*imag, imag*imag + + 
+.endm + +.macro KERNEL1x2_SUB1 + + lxvd2x vs0, o0, AO // load real,imag from A + lxvd2x vs1, o16, AO // load real,imag from A + + addi AO, AO, 32 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + xvmaddadp vs34, vs1, vs16 // real*real, imag*real + xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag + + +.endm + +.macro SAVE1x2 + + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvd2x vs16, o0, T1 + lxvd2x vs17, o16, T1 + +#endif + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs32 // realA*realB + XSFADD_R2 vs0, vs0, vs33 // imagA*imagB + + xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs32 // realA*imagB + XSFADD_I2 vs1, vs1, vs33 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs8, vs2, vs3, 0 // merge real and imag part + + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs34 // realA*realB + XSFADD_R2 vs0, vs0, vs35 // imagA*imagB + + xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs34 // realA*imagB + XSFADD_I2 vs1, vs1, vs35 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp 
vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs9, vs2, vs3, 0 // merge real and imag part + + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + xvadddp vs9, vs9, vs17 + +#endif + + stxvd2x vs8, o0, T1 + stxvd2x vs9, o16, T1 + + add T1, T1, LDC + addi CO, CO, 32 + +.endm + + +/********************************************************************************************** +* Macros for N=1 and M=1 +**********************************************************************************************/ + +.macro LOAD1x1_1 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + lxvd2x vs0, o0, AO // load real,imag from A + + addi AO, AO, 16 + + +.endm + +.macro KERNEL1x1_I1 + + lxvd2x vs8, o0, AO // load real,imag from A + + addi AO, AO, 16 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x1_1 + + lxvd2x vs8, o0, AO // load real,imag from A + + addi AO, AO, 16 + + lxvdsx vs20, o0, BO // load real part from B + lxvdsx vs21, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x1_2 + + lxvd2x vs0, o0, AO // load real,imag from A + + addi AO, AO, 16 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x1_E2 + + + xvmaddadp vs32, vs8, vs20 // real*real, imag*real + xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x1_SUBI1 + + lxvd2x vs0, o0, AO // load real,imag 
from A + + addi AO, AO, 16 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmuldp vs32, vs0, vs16 // real*real, imag*real + xvmuldp vs33, vs0, vs17 // real*imag, imag*imag + + +.endm + +.macro KERNEL1x1_SUB1 + + lxvd2x vs0, o0, AO // load real,imag from A + + addi AO, AO, 16 + + lxvdsx vs16, o0, BO // load real part from B + lxvdsx vs17, o8, BO // load imag part from B + + addi BO, BO, 16 + + xvmaddadp vs32, vs0, vs16 // real*real, imag*real + xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag + + +.endm + +.macro SAVE1x1 + + + mr T1, CO + +#ifndef TRMMKERNEL + + lxvd2x vs16, o0, T1 + +#endif + + + xxlxor vs0, vs0, vs0 + xxlxor vs1, vs1, vs1 + xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + + XSFADD_R1 vs0, vs0, vs32 // realA*realB + XSFADD_R2 vs0, vs0, vs33 // imagA*imagB + + xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB + xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + + XSFADD_I1 vs1, vs1, vs32 // realA*imagB + XSFADD_I2 vs1, vs1, vs33 // imagA*realB + + xsmuldp vs4, vs0, alpha_r // real*alpha_r + xsmuldp vs5, vs1, alpha_i // imag*alpha_i + xsmuldp vs6, vs0, alpha_i // real*alpha_i + xsmuldp vs7, vs1, alpha_r // imag*alpha_r + + xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i + xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r + xxpermdi vs8, vs2, vs3, 0 // merge real and imag part + + +#ifndef TRMMKERNEL + + xvadddp vs8, vs8, vs16 + +#endif + + stxvd2x vs8, o0, T1 + + add T1, T1, LDC + addi CO, CO, 16 + +.endm + diff --git a/param.h b/param.h index 2efd9b2c1..a6ead4b64 100644 --- a/param.h +++ b/param.h @@ -1980,7 +1980,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define SGEMM_DEFAULT_P 960 #define DGEMM_DEFAULT_P 480 #define CGEMM_DEFAULT_P 720 -#define ZGEMM_DEFAULT_P 240 +#define ZGEMM_DEFAULT_P 480 #define SGEMM_DEFAULT_Q 720 #define DGEMM_DEFAULT_Q 720 @@ -1990,7 +1990,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SGEMM_DEFAULT_R 21600 #define DGEMM_DEFAULT_R 14400 #define CGEMM_DEFAULT_R 16200 -#define ZGEMM_DEFAULT_R 14400 +#define ZGEMM_DEFAULT_R 21600 #define SYMV_P 8 From 08bddde3f3abe3337a1a4177a6a9dbb2428fc87c Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Fri, 8 Apr 2016 10:37:59 +0200 Subject: [PATCH 45/48] updated benchmark Makefile for ESSL --- benchmark/Makefile | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/benchmark/Makefile b/benchmark/Makefile index badd42c6b..8166f3863 100644 --- a/benchmark/Makefile +++ b/benchmark/Makefile @@ -34,7 +34,8 @@ LIBMKL = -L$(MKL) -lmkl_intel_lp64 -lmkl_gnu_thread -lmkl_core -lgomp -lpthread LIBVECLIB = -framework Accelerate ESSL=/opt/ibm/lib -LIBESSL = -lessl $(ESSL)/libxlomp_ser.so.1 $(ESSL)/libxlf90_r.so.1 $(ESSL)/libxlfmath.so.1 $(ESSL)/libxlsmp.so.1 /opt/ibm/xlC/13.1.2/lib/libxl.a +#LIBESSL = -lesslsmp $(ESSL)/libxlomp_ser.so.1 $(ESSL)/libxlf90_r.so.1 $(ESSL)/libxlfmath.so.1 $(ESSL)/libxlsmp.so.1 /opt/ibm/xlC/13.1.3/lib/libxl.a +LIBESSL = -lesslsmp $(ESSL)/libxlf90_r.so.1 $(ESSL)/libxlfmath.so.1 $(ESSL)/libxlsmp.so.1 /opt/ibm/xlC/13.1.3/lib/libxl.a ifeq ($(OSNAME), WINNT) @@ -259,7 +260,8 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ endif essl :: sgemm.essl strmm.essl dgemm.essl dtrmm.essl \ - cgemm.essl ctrmm.essl zgemm.essl ztrmm.essl + cgemm.essl ctrmm.essl zgemm.essl ztrmm.essl \ + slinpack.essl clinpack.essl dlinpack.essl zlinpack.essl veclib :: slinpack.veclib dlinpack.veclib clinpack.veclib zlinpack.veclib \ scholesky.veclib dcholesky.veclib ccholesky.veclib zcholesky.veclib \ @@ -312,6 +314,9 @@ slinpack.mkl : slinpack.$(SUFFIX) slinpack.veclib : 
slinpack.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +slinpack.essl : slinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Dlinpack #################################################### dlinpack.goto : dlinpack.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm @@ -328,6 +333,9 @@ dlinpack.mkl : dlinpack.$(SUFFIX) dlinpack.veclib : dlinpack.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +dlinpack.essl : dlinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Clinpack #################################################### clinpack.goto : clinpack.$(SUFFIX) ../$(LIBNAME) @@ -345,6 +353,9 @@ clinpack.mkl : clinpack.$(SUFFIX) clinpack.veclib : clinpack.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +clinpack.essl : clinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Zlinpack #################################################### zlinpack.goto : zlinpack.$(SUFFIX) ../$(LIBNAME) @@ -362,6 +373,9 @@ zlinpack.mkl : zlinpack.$(SUFFIX) zlinpack.veclib : zlinpack.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +zlinpack.essl : zlinpack.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Scholesky ################################################### scholesky.goto : scholesky.$(SUFFIX) ../$(LIBNAME) From f24d5307cf71f745f74c0064ba7dfc59645041b1 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Tue, 12 Apr 2016 22:26:11 +0800 Subject: [PATCH 46/48] Refs #834. Fix zgemv config bug on Steamroller. 
--- kernel/x86_64/KERNEL.STEAMROLLER | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/KERNEL.STEAMROLLER b/kernel/x86_64/KERNEL.STEAMROLLER index f14c82303..4ec748284 100644 --- a/kernel/x86_64/KERNEL.STEAMROLLER +++ b/kernel/x86_64/KERNEL.STEAMROLLER @@ -24,7 +24,7 @@ SGEMVTKERNEL = sgemv_t_4.c DGEMVNKERNEL = dgemv_n_4.c DGEMVTKERNEL = dgemv_t_4.c -ZGEMVNKERNEL = zgemv_t_4.c +ZGEMVNKERNEL = zgemv_n_4.c ZGEMVTKERNEL = zgemv_t_4.c DCOPYKERNEL = dcopy_bulldozer.S From faa73690e4463fb6fae394207dfe0e4b6e51a094 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Tue, 12 Apr 2016 11:49:28 -0400 Subject: [PATCH 47/48] Delete LOCAL_BUFFER_SIZE for other architectures. --- getarch_2nd.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/getarch_2nd.c b/getarch_2nd.c index fad647fed..cf9c578cb 100644 --- a/getarch_2nd.c +++ b/getarch_2nd.c @@ -64,10 +64,13 @@ int main(int argc, char **argv) { if ((argc >= 2) && (*argv[1] == '1')) { + +#if defined(ARCH_X86) || defined(ARCH_X86_64) printf("#define SLOCAL_BUFFER_SIZE\t%ld\n", (SGEMM_DEFAULT_Q * SGEMM_DEFAULT_UNROLL_N * 4 * 1 * sizeof(float))); printf("#define DLOCAL_BUFFER_SIZE\t%ld\n", (DGEMM_DEFAULT_Q * DGEMM_DEFAULT_UNROLL_N * 2 * 1 * sizeof(double))); printf("#define CLOCAL_BUFFER_SIZE\t%ld\n", (CGEMM_DEFAULT_Q * CGEMM_DEFAULT_UNROLL_N * 4 * 2 * sizeof(float))); printf("#define ZLOCAL_BUFFER_SIZE\t%ld\n", (ZGEMM_DEFAULT_Q * ZGEMM_DEFAULT_UNROLL_N * 2 * 2 * sizeof(double))); +#endif #ifdef USE64BITINT printf("#define USE64BITINT\n"); From 1e03a62b676ede721225e2137f35bb7d528498bc Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Tue, 12 Apr 2016 15:28:31 -0400 Subject: [PATCH 48/48] Update doc for 0.2.18 version. 
--- CMakeLists.txt | 2 +- Changelog.txt | 18 ++++++++++++++++++ Makefile.rule | 2 +- appveyor.yml | 2 +- 4 files changed, 21 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ff765ccbe..ead63bff8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.4) project(OpenBLAS) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 2) -set(OpenBLAS_PATCH_VERSION 18.dev) +set(OpenBLAS_PATCH_VERSION 18) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") enable_language(ASM) diff --git a/Changelog.txt b/Changelog.txt index c59166c38..7f82e8e88 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -1,4 +1,22 @@ OpenBLAS ChangeLog +==================================================================== +Version 0.2.18 +12-Apr-2016 +common: + * If you set MAKE_NB_JOBS flag less or equal than zero, + make will be without -j. + +x86/x86_64: + * Support building Visual Studio static library. (#813, Thanks, theoractice) + * Fix bugs to pass buidbot CI tests (http://build.openblas.net) + +ARM: + * Provide DGEMM 8x4 kernel for Cortex-A57 (Thanks, Ashwin Sekhar T K) + +POWER: + * Optimize S and C BLAS3 on Power8 + * Optimize BLAS2/1 on Power8 + ==================================================================== Version 0.2.17 20-Mar-2016 diff --git a/Makefile.rule b/Makefile.rule index 27aa5a539..d8db6102c 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.2.18.dev +VERSION = 0.2.18 # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library diff --git a/appveyor.yml b/appveyor.yml index d9359e99a..5360a9ef9 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -1,4 +1,4 @@ -version: 0.2.17.{build} +version: 0.2.18.{build} #environment: