optimized dtrsm_kernel_LT for POWER8 (tags/v0.2.19^2)
| @@ -262,7 +262,8 @@ endif | |||
| # Aggregate target: asks make to build every *.essl benchmark binary named in the prerequisite list. | |||
| # Written as a double-colon rule (essl ::). | |||
| essl :: sgemm.essl strmm.essl dgemm.essl dtrmm.essl \ | |||
| cgemm.essl ctrmm.essl zgemm.essl ztrmm.essl \ | |||
| slinpack.essl clinpack.essl dlinpack.essl zlinpack.essl \ | |||
| scholesky.essl ccholesky.essl dcholesky.essl zcholesky.essl | |||
| scholesky.essl ccholesky.essl dcholesky.essl zcholesky.essl \ | |||
| strsm.essl dtrsm.essl ctrsm.essl ztrsm.essl | |||
| veclib :: slinpack.veclib dlinpack.veclib clinpack.veclib zlinpack.veclib \ | |||
| scholesky.veclib dcholesky.veclib ccholesky.veclib zcholesky.veclib \ | |||
| @@ -696,6 +697,9 @@ strsm.mkl : strsm.$(SUFFIX) | |||
| # strsm benchmark linked against $(LIBVECLIB); the leading '-' makes make ignore a failed link. | |||
| strsm.veclib : strsm.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| # strsm benchmark linked against $(LIBESSL); the leading '-' makes make ignore a failed link. | |||
| strsm.essl : strsm.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| ##################################### Dtrsm #################################################### | |||
| # dtrsm benchmark linked against ../$(LIBNAME), which reaches the link line through $^. | |||
| # The recipe has no '-' prefix, so a failed link stops the build. | |||
| dtrsm.goto : dtrsm.$(SUFFIX) ../$(LIBNAME) | |||
| $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm | |||
| @@ -712,6 +716,9 @@ dtrsm.mkl : dtrsm.$(SUFFIX) | |||
| # dtrsm benchmark linked against $(LIBVECLIB); the leading '-' makes make ignore a failed link. | |||
| dtrsm.veclib : dtrsm.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| # dtrsm benchmark linked against $(LIBESSL); the leading '-' makes make ignore a failed link. | |||
| dtrsm.essl : dtrsm.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| ##################################### Ctrsm #################################################### | |||
| ctrsm.goto : ctrsm.$(SUFFIX) ../$(LIBNAME) | |||
| @@ -729,6 +736,9 @@ ctrsm.mkl : ctrsm.$(SUFFIX) | |||
| # ctrsm benchmark linked against $(LIBVECLIB); the leading '-' makes make ignore a failed link. | |||
| ctrsm.veclib : ctrsm.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| # ctrsm benchmark linked against $(LIBESSL); the leading '-' makes make ignore a failed link. | |||
| ctrsm.essl : ctrsm.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| ##################################### Ztrsm #################################################### | |||
| ztrsm.goto : ztrsm.$(SUFFIX) ../$(LIBNAME) | |||
| @@ -746,6 +756,9 @@ ztrsm.mkl : ztrsm.$(SUFFIX) | |||
| # ztrsm benchmark linked against $(LIBVECLIB); the leading '-' makes make ignore a failed link. | |||
| ztrsm.veclib : ztrsm.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| # ztrsm benchmark linked against $(LIBESSL); the leading '-' makes make ignore a failed link. | |||
| ztrsm.essl : ztrsm.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| ##################################### Ssyrk #################################################### | |||
| # ssyrk benchmark linked against ../$(LIBNAME), which reaches the link line through $^. | |||
| # The recipe has no '-' prefix, so a failed link stops the build. | |||
| ssyrk.goto : ssyrk.$(SUFFIX) ../$(LIBNAME) | |||
| $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm | |||
| @@ -54,7 +54,7 @@ STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| # DTRSM kernel sources: LN, RN and RT use the generic C implementations. | |||
| # DTRSMKERNEL_LT is assigned twice below; make keeps the later value, the POWER8 assembly file. | |||
| DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| DTRSMKERNEL_LT = dtrsm_kernel_LT_16x4_power8.S | |||
| DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| @@ -0,0 +1,294 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| #include "def_vsx.h" | |||
| /* LOAD: word-sized load mnemonic, lwz for 32-bit builds and ld for 64-bit builds. */ | |||
| #ifndef __64BIT__ | |||
| #define LOAD lwz | |||
| #else | |||
| #define LOAD ld | |||
| #endif | |||
| /* STACKSIZE is the number of bytes the prologue subtracts from SP; ALPHA and FZERO name two */ | |||
| /* slots near the top of that frame. */ | |||
| /* NOTE(review): ALPHA, FZERO and LOAD are not referenced by this file or by the logic file; */ | |||
| /* possibly used by dtrsm_macros_LT_16x4_power8.S - confirm. */ | |||
| #ifdef __64BIT__ | |||
| #define STACKSIZE 320 | |||
| #define ALPHA 296(SP) | |||
| #define FZERO 304(SP) | |||
| #else | |||
| #define STACKSIZE 240 | |||
| #define ALPHA 224(SP) | |||
| #define FZERO 232(SP) | |||
| #endif | |||
| /* Incoming argument registers. M, N and K are always r3, r4 and r5. */ | |||
| /* The registers named A, B, C, LDC and OFFSET depend on the OS and word size (see below). */ | |||
| /* Where an argument does not arrive in a register, the prologue loads it from the caller's */ | |||
| /* frame into the register named here. */ | |||
| #define M r3 | |||
| #define N r4 | |||
| #define K r5 | |||
| /* linux: 32-bit uses r6-r10 for A, B, C, LDC, OFFSET; 64-bit uses r7-r10 for A, B, C, LDC */ | |||
| /* and r6 for OFFSET (loaded by the prologue). */ | |||
| #ifdef linux | |||
| #ifndef __64BIT__ | |||
| #define A r6 | |||
| #define B r7 | |||
| #define C r8 | |||
| #define LDC r9 | |||
| #define OFFSET r10 | |||
| #else | |||
| #define A r7 | |||
| #define B r8 | |||
| #define C r9 | |||
| #define LDC r10 | |||
| #define OFFSET r6 | |||
| #endif | |||
| #endif | |||
| /* AIX / Apple: 32-bit DOUBLE builds use r8-r10 for A, B, C and load both LDC (r7) and */ | |||
| /* OFFSET (r6) in the prologue; all other builds use the same map as 64-bit linux. */ | |||
| #if defined(_AIX) || defined(__APPLE__) | |||
| #if !defined(__64BIT__) && defined(DOUBLE) | |||
| #define A r8 | |||
| #define B r9 | |||
| #define C r10 | |||
| #define LDC r7 | |||
| #define OFFSET r6 | |||
| #else | |||
| #define A r7 | |||
| #define B r8 | |||
| #define C r9 | |||
| #define LDC r10 | |||
| #define OFFSET r6 | |||
| #endif | |||
| #endif | |||
| /* o0 is the literal 0 (not a register). */ | |||
| #define o0 0 | |||
| /* Working registers, as used by the prologue and the included logic file: */ | |||
| /*   PRE        - loaded with 384 in the prologue; index operand of the dcbt prefetches on AO/BO */ | |||
| /*   T1..T4     - temporaries (address arithmetic, andi. results) */ | |||
| /*   L          - inner-loop counter, initialised from KK before each KERNEL loop */ | |||
| /*   KK         - reset to OFFSET per column panel, increased by the tile height after each tile */ | |||
| /*   I, J       - remaining 16-row tiles / remaining 4-column panels */ | |||
| /*   AO, BO, CO - working pointers derived from A, B and C */ | |||
| /*   o8..o48    - loaded with the constants 8, 16, 24, 32 and 48 in the prologue; */ | |||
| /*                presumably index registers for the included macros - confirm */ | |||
| #define PRE r15 | |||
| #define T4 r16 | |||
| #define L r17 | |||
| #define T3 r18 | |||
| #define T2 r19 | |||
| #define KK r20 | |||
| #define I r21 | |||
| #define J r22 | |||
| #define AO r23 | |||
| #define BO r24 | |||
| #define CO r25 | |||
| #define o8 r26 | |||
| #define o16 r27 | |||
| #define o24 r28 | |||
| #define o32 r29 | |||
| #define o48 r30 | |||
| #define T1 r31 | |||
| #include "dtrsm_macros_LT_16x4_power8.S" | |||
| /* Kernel entry point (omitted when NEEDPARAM is defined). */ | |||
| /* Allocates a STACKSIZE-byte frame, saves f14-f31 and r15-r31, fetches the arguments that */ | |||
| /* arrive on the caller's stack, returns early when M, N or K is <= 0, then sets up the */ | |||
| /* constant registers and runs the included logic file. L999 is the common exit: it sets */ | |||
| /* r3 to 0, restores every saved register, releases the frame and returns. */ | |||
| #ifndef NEEDPARAM | |||
| PROLOGUE | |||
| PROFCODE | |||
| addi SP, SP, -STACKSIZE | |||
| /* r0 = 0: used as the zero index operand of the dcbt instructions in the logic file. */ | |||
| li r0, 0 | |||
| /* Save f14-f31 at frame offsets 0..143. */ | |||
| stfd f14, 0(SP) | |||
| stfd f15, 8(SP) | |||
| stfd f16, 16(SP) | |||
| stfd f17, 24(SP) | |||
| stfd f18, 32(SP) | |||
| stfd f19, 40(SP) | |||
| stfd f20, 48(SP) | |||
| stfd f21, 56(SP) | |||
| stfd f22, 64(SP) | |||
| stfd f23, 72(SP) | |||
| stfd f24, 80(SP) | |||
| stfd f25, 88(SP) | |||
| stfd f26, 96(SP) | |||
| stfd f27, 104(SP) | |||
| stfd f28, 112(SP) | |||
| stfd f29, 120(SP) | |||
| stfd f30, 128(SP) | |||
| stfd f31, 136(SP) | |||
| /* Save every general-purpose register aliased above (r15-r31), starting at offset 144. */ | |||
| #ifdef __64BIT__ | |||
| std r31, 144(SP) | |||
| std r30, 152(SP) | |||
| std r29, 160(SP) | |||
| std r28, 168(SP) | |||
| std r27, 176(SP) | |||
| std r26, 184(SP) | |||
| std r25, 192(SP) | |||
| std r24, 200(SP) | |||
| std r23, 208(SP) | |||
| std r22, 216(SP) | |||
| std r21, 224(SP) | |||
| std r20, 232(SP) | |||
| std r19, 240(SP) | |||
| std r18, 248(SP) | |||
| std r17, 256(SP) | |||
| std r16, 264(SP) | |||
| std r15, 272(SP) | |||
| #else | |||
| stw r31, 144(SP) | |||
| stw r30, 148(SP) | |||
| stw r29, 152(SP) | |||
| stw r28, 156(SP) | |||
| stw r27, 160(SP) | |||
| stw r26, 164(SP) | |||
| stw r25, 168(SP) | |||
| stw r24, 172(SP) | |||
| stw r23, 176(SP) | |||
| stw r22, 180(SP) | |||
| stw r21, 184(SP) | |||
| stw r20, 188(SP) | |||
| stw r19, 192(SP) | |||
| stw r18, 196(SP) | |||
| /* r17, r16 and r15 are L, T4 and PRE and are overwritten below, so they must be preserved */ | |||
| /* here exactly as in the 64-bit path. Offsets 200-211 are free (ALPHA starts at 224). */ | |||
| stw r17, 200(SP) | |||
| stw r16, 204(SP) | |||
| stw r15, 208(SP) | |||
| #endif | |||
| /* Fetch the arguments that were passed on the caller's stack (caller frame = SP + STACKSIZE). */ | |||
| #if defined(_AIX) || defined(__APPLE__) | |||
| #if !defined(__64BIT__) && defined(DOUBLE) | |||
| lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) | |||
| #endif | |||
| #endif | |||
| #if defined(linux) && defined(__64BIT__) | |||
| ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) | |||
| #endif | |||
| #if defined(_AIX) || defined(__APPLE__) | |||
| #ifdef __64BIT__ | |||
| ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) | |||
| #else | |||
| #ifdef DOUBLE | |||
| lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) | |||
| #else | |||
| lwz OFFSET, FRAMESLOT(0) + STACKSIZE(SP) | |||
| #endif | |||
| #endif | |||
| #endif | |||
| /* Nothing to do when any dimension is <= 0. */ | |||
| cmpwi cr0, M, 0 | |||
| ble L999 | |||
| cmpwi cr0, N, 0 | |||
| ble L999 | |||
| cmpwi cr0, K, 0 | |||
| ble L999 | |||
| /* Scale LDC by 1 << BASE_SHIFT (presumably elements to bytes; BASE_SHIFT comes from common.h). */ | |||
| slwi LDC, LDC, BASE_SHIFT | |||
| /* Constant registers used by the logic and macro files. */ | |||
| li o8, 8 | |||
| li o16, 16 | |||
| li o24, 24 | |||
| li o32, 32 | |||
| li o48, 48 | |||
| li PRE, 384 | |||
| mr KK, OFFSET | |||
| #include "dtrsm_logic_LT_16x4_power8.S" | |||
| /* Common exit: return value 0 in r3, then undo the prologue. */ | |||
| L999: | |||
| addi r3, 0, 0 | |||
| lfd f14, 0(SP) | |||
| lfd f15, 8(SP) | |||
| lfd f16, 16(SP) | |||
| lfd f17, 24(SP) | |||
| lfd f18, 32(SP) | |||
| lfd f19, 40(SP) | |||
| lfd f20, 48(SP) | |||
| lfd f21, 56(SP) | |||
| lfd f22, 64(SP) | |||
| lfd f23, 72(SP) | |||
| lfd f24, 80(SP) | |||
| lfd f25, 88(SP) | |||
| lfd f26, 96(SP) | |||
| lfd f27, 104(SP) | |||
| lfd f28, 112(SP) | |||
| lfd f29, 120(SP) | |||
| lfd f30, 128(SP) | |||
| lfd f31, 136(SP) | |||
| #ifdef __64BIT__ | |||
| ld r31, 144(SP) | |||
| ld r30, 152(SP) | |||
| ld r29, 160(SP) | |||
| ld r28, 168(SP) | |||
| ld r27, 176(SP) | |||
| ld r26, 184(SP) | |||
| ld r25, 192(SP) | |||
| ld r24, 200(SP) | |||
| ld r23, 208(SP) | |||
| ld r22, 216(SP) | |||
| ld r21, 224(SP) | |||
| ld r20, 232(SP) | |||
| ld r19, 240(SP) | |||
| ld r18, 248(SP) | |||
| ld r17, 256(SP) | |||
| ld r16, 264(SP) | |||
| ld r15, 272(SP) | |||
| #else | |||
| lwz r31, 144(SP) | |||
| lwz r30, 148(SP) | |||
| lwz r29, 152(SP) | |||
| lwz r28, 156(SP) | |||
| lwz r27, 160(SP) | |||
| lwz r26, 164(SP) | |||
| lwz r25, 168(SP) | |||
| lwz r24, 172(SP) | |||
| lwz r23, 176(SP) | |||
| lwz r22, 180(SP) | |||
| lwz r21, 184(SP) | |||
| lwz r20, 188(SP) | |||
| lwz r19, 192(SP) | |||
| lwz r18, 196(SP) | |||
| /* Matches the three extra stores in the 32-bit prologue path. */ | |||
| lwz r17, 200(SP) | |||
| lwz r16, 204(SP) | |||
| lwz r15, 208(SP) | |||
| #endif | |||
| addi SP, SP, STACKSIZE | |||
| blr | |||
| EPILOGUE | |||
| #endif | |||
| @@ -0,0 +1,758 @@ | |||
| /* ---- Column panels of width 4. J = N >> 2 panels; the section is skipped when that is <= 0. ---- */ | |||
| /* The INIT_*, KERNEL_* and SOLVE_LT_* macros come from the macro file, which is not visible here. */ | |||
| srawi. J, N, 2 | |||
| ble DSTRM_LT_L4_END | |||
| DSTRM_LT_L4_BEGIN: | |||
| /* Per panel: CO = current C panel, AO = start of A, C moves on by 4*LDC, KK restarts from OFFSET. */ | |||
| mr CO, C | |||
| mr AO, A | |||
| slwi T1, LDC , 2 | |||
| add C, C, T1 | |||
| mr KK, OFFSET | |||
| /* I = M >> 4: number of full 16-row tiles. */ | |||
| srawi. I, M, 4 | |||
| ble DSTRM_LT_L4x16_END | |||
| DSTRM_LT_L4x16_BEGIN: | |||
| mr BO, B | |||
| /* dcbt on the C tile: T1..T4 are CO, CO+LDC, CO+2*LDC and CO+3*LDC masked with -128 */ | |||
| /* (rounded down to a multiple of 128); each is touched, then touched again 128 bytes further on. */ | |||
| /* r0 is the zero index; the including kernel file loads it with 0. */ | |||
| li L, -128 | |||
| mr T1, CO | |||
| add T2, T1, LDC | |||
| add T3, T2, LDC | |||
| add T4, T3, LDC | |||
| and T1, T1, L | |||
| and T2, T2, L | |||
| and T3, T3, L | |||
| and T4, T4, L | |||
| dcbt T1, r0 | |||
| dcbt T2, r0 | |||
| dcbt T3, r0 | |||
| dcbt T4, r0 | |||
| addi T1, T1, 128 | |||
| addi T2, T2, 128 | |||
| addi T3, T3, 128 | |||
| addi T4, T4, 128 | |||
| dcbt T1, r0 | |||
| dcbt T2, r0 | |||
| dcbt T3, r0 | |||
| dcbt T4, r0 | |||
| DSTRM_LT_L4x16_LOOP_START: | |||
| INIT_16x4 | |||
| /* L = KK (adding 0 copies the value and sets cr0); no KERNEL steps are run when KK <= 0. */ | |||
| addic. L, KK, 0 | |||
| ble- DSTRM_LT_L4x16_SAVE | |||
| DSTRM_LT_L4x16_LOOP: | |||
| /* Body unrolled four times: each copy prefetches at AO+PRE (the first also at BO+PRE), runs */ | |||
| /* one KERNEL_16x4 step and leaves for SAVE as soon as L reaches zero. */ | |||
| dcbt AO, PRE | |||
| dcbt BO, PRE | |||
| KERNEL_16x4 | |||
| addic. L, L, -1 | |||
| ble- DSTRM_LT_L4x16_SAVE | |||
| dcbt AO, PRE | |||
| KERNEL_16x4 | |||
| addic. L, L, -1 | |||
| ble- DSTRM_LT_L4x16_SAVE | |||
| dcbt AO, PRE | |||
| KERNEL_16x4 | |||
| addic. L, L, -1 | |||
| ble- DSTRM_LT_L4x16_SAVE | |||
| dcbt AO, PRE | |||
| KERNEL_16x4 | |||
| addic. L, L, -1 | |||
| bgt+ DSTRM_LT_L4x16_LOOP | |||
| DSTRM_LT_L4x16_SAVE: | |||
| SOLVE_LT_16x4 | |||
| /* Next tile: CO += 16*SIZE; AO += (K-KK)*16 and BO += (K-KK)*4, both scaled by 1 << BASE_SHIFT; KK += 16. */ | |||
| addi CO, CO, 16*SIZE | |||
| sub T3, K, KK | |||
| sub T4, K, KK | |||
| slwi T3, T3, 4+BASE_SHIFT | |||
| slwi T4, T4, 2+BASE_SHIFT | |||
| add AO, AO, T3 | |||
| add BO, BO, T4 | |||
| addi KK, KK, 16 | |||
| addic. I, I, -1 | |||
| bgt DSTRM_LT_L4x16_BEGIN | |||
| DSTRM_LT_L4x16_END: | |||
| DSTRM_LT_L4x8_BEGIN: | |||
| /* Remainder rows: skip all narrower tiles when M & 15 is zero; otherwise bits 8, 4, 2 and 1 */ | |||
| /* of M each select one tile of that height, handled like the 16-row tile without the unrolling. */ | |||
| andi. T2, M, 15 | |||
| ble DSTRM_LT_L4x1_END | |||
| /* 8-row tile, taken when M & 8 is set. */ | |||
| andi. T1, M, 8 | |||
| ble DSTRM_LT_L4x8_END | |||
| mr BO, B | |||
| DSTRM_LT_L4x8_LOOP_START: | |||
| INIT_8x4 | |||
| addic. L, KK, 0 | |||
| ble DSTRM_LT_L4x8_SAVE | |||
| DSTRM_LT_L4x8_LOOP: | |||
| KERNEL_8x4 | |||
| addic. L, L, -1 | |||
| bgt DSTRM_LT_L4x8_LOOP | |||
| DSTRM_LT_L4x8_SAVE: | |||
| SOLVE_LT_8x4 | |||
| addi CO, CO, 8*SIZE | |||
| sub T3, K, KK | |||
| sub T4, K, KK | |||
| slwi T3, T3, 3+BASE_SHIFT | |||
| slwi T4, T4, 2+BASE_SHIFT | |||
| add AO, AO, T3 | |||
| add BO, BO, T4 | |||
| addi KK, KK, 8 | |||
| DSTRM_LT_L4x8_END: | |||
| DSTRM_LT_L4x4_BEGIN: | |||
| /* 4-row tile, taken when M & 4 is set. */ | |||
| andi. T1, M, 4 | |||
| ble DSTRM_LT_L4x4_END | |||
| mr BO, B | |||
| DSTRM_LT_L4x4_LOOP_START: | |||
| INIT_4x4 | |||
| addic. L, KK, 0 | |||
| ble DSTRM_LT_L4x4_SAVE | |||
| DSTRM_LT_L4x4_LOOP: | |||
| KERNEL_4x4 | |||
| addic. L, L, -1 | |||
| bgt DSTRM_LT_L4x4_LOOP | |||
| DSTRM_LT_L4x4_SAVE: | |||
| SOLVE_LT_4x4 | |||
| addi CO, CO, 4*SIZE | |||
| sub T3, K, KK | |||
| sub T4, K, KK | |||
| slwi T3, T3, 2+BASE_SHIFT | |||
| slwi T4, T4, 2+BASE_SHIFT | |||
| add AO, AO, T3 | |||
| add BO, BO, T4 | |||
| addi KK, KK, 4 | |||
| DSTRM_LT_L4x4_END: | |||
| DSTRM_LT_L4x2_BEGIN: | |||
| /* 2-row tile, taken when M & 2 is set. */ | |||
| andi. T1, M, 2 | |||
| ble DSTRM_LT_L4x2_END | |||
| mr BO, B | |||
| DSTRM_LT_L4x2_LOOP_START: | |||
| INIT_2x4 | |||
| addic. L, KK, 0 | |||
| ble DSTRM_LT_L4x2_SAVE | |||
| DSTRM_LT_L4x2_LOOP: | |||
| KERNEL_2x4 | |||
| addic. L, L, -1 | |||
| bgt DSTRM_LT_L4x2_LOOP | |||
| DSTRM_LT_L4x2_SAVE: | |||
| SOLVE_LT_2x4 | |||
| addi CO, CO, 2*SIZE | |||
| sub T3, K, KK | |||
| sub T4, K, KK | |||
| slwi T3, T3, 1+BASE_SHIFT | |||
| slwi T4, T4, 2+BASE_SHIFT | |||
| add AO, AO, T3 | |||
| add BO, BO, T4 | |||
| addi KK, KK, 2 | |||
| DSTRM_LT_L4x2_END: | |||
| DSTRM_LT_L4x1_BEGIN: | |||
| /* 1-row tile, taken when M & 1 is set. */ | |||
| andi. T1, M, 1 | |||
| ble DSTRM_LT_L4x1_END | |||
| mr BO, B | |||
| DSTRM_LT_L4x1_LOOP_START: | |||
| INIT_1x4 | |||
| addic. L, KK, 0 | |||
| ble DSTRM_LT_L4x1_SAVE | |||
| DSTRM_LT_L4x1_LOOP: | |||
| KERNEL_1x4 | |||
| addic. L, L, -1 | |||
| bgt DSTRM_LT_L4x1_LOOP | |||
| DSTRM_LT_L4x1_SAVE: | |||
| SOLVE_LT_1x4 | |||
| addi CO, CO, 1*SIZE | |||
| sub T3, K, KK | |||
| sub T4, K, KK | |||
| slwi T3, T3, 0+BASE_SHIFT | |||
| slwi T4, T4, 2+BASE_SHIFT | |||
| add AO, AO, T3 | |||
| add BO, BO, T4 | |||
| addi KK, KK, 1 | |||
| DSTRM_LT_L4x1_END: | |||
| /* Panel finished: B += K*4 scaled by 1 << BASE_SHIFT, then repeat while panels remain. */ | |||
| slwi T1, K, 2+BASE_SHIFT | |||
| add B, B, T1 | |||
| addic. J, J, -1 | |||
| bgt DSTRM_LT_L4_BEGIN | |||
| /* All width-4 panels done; exit through L999 when N & 3 is zero (no remainder columns). */ | |||
| andi. T2, N, 3 | |||
| ble L999 | |||
| DSTRM_LT_L4_END: | |||
| b DSTRM_LT_L2_BEGIN | |||
| /* NOTE(review): nothing in the visible code branches to L999_H1; it only forwards to L999. */ | |||
| L999_H1: | |||
| b L999 | |||
| /* ---- Column panel of width 2, processed once when N & 2 is set. ---- */ | |||
| /* Same structure as the width-4 section, with the *x2 macros (defined in the macro file, */ | |||
| /* not visible here) and BO advanced by (K-KK)*2 scaled by 1 << BASE_SHIFT. */ | |||
| DSTRM_LT_L2_BEGIN: | |||
| andi. T1, N, 2 | |||
| ble DSTRM_LT_L2_END | |||
| /* CO = current C panel, AO = start of A, C moves on by 2*LDC, KK restarts from OFFSET. */ | |||
| mr CO, C | |||
| mr AO, A | |||
| slwi T1, LDC , 1 | |||
| add C, C, T1 | |||
| mr KK, OFFSET | |||
| /* I = M >> 4: number of full 16-row tiles. */ | |||
| srawi. I, M, 4 | |||
| ble DSTRM_LT_L2x16_END | |||
| DSTRM_LT_L2x16_BEGIN: | |||
| mr BO, B | |||
| DSTRM_LT_L2x16_LOOP_START: | |||
| INIT_16x2 | |||
| /* L = KK; the KERNEL loop is skipped when KK <= 0. */ | |||
| addic. L, KK, 0 | |||
| ble DSTRM_LT_L2x16_SAVE | |||
| DSTRM_LT_L2x16_LOOP: | |||
| KERNEL_16x2 | |||
| addic. L, L, -1 | |||
| bgt DSTRM_LT_L2x16_LOOP | |||
| DSTRM_LT_L2x16_SAVE: | |||
| SOLVE_LT_16x2 | |||
| /* Next tile: CO += 16*SIZE; AO += (K-KK)*16 and BO += (K-KK)*2, both scaled by 1 << BASE_SHIFT; KK += 16. */ | |||
| addi CO, CO, 16*SIZE | |||
| sub T3, K, KK | |||
| sub T4, K, KK | |||
| slwi T3, T3, 4+BASE_SHIFT | |||
| slwi T4, T4, 1+BASE_SHIFT | |||
| add AO, AO, T3 | |||
| add BO, BO, T4 | |||
| addi KK, KK, 16 | |||
| addic. I, I, -1 | |||
| bgt DSTRM_LT_L2x16_BEGIN | |||
| DSTRM_LT_L2x16_END: | |||
| DSTRM_LT_L2x8_BEGIN: | |||
| /* Remainder rows: skip all narrower tiles when M & 15 is zero; otherwise bits 8, 4, 2 and 1 */ | |||
| /* of M each select one tile of that height. */ | |||
| andi. T2, M, 15 | |||
| ble DSTRM_LT_L2x1_END | |||
| /* 8-row tile, taken when M & 8 is set. */ | |||
| andi. T1, M, 8 | |||
| ble DSTRM_LT_L2x8_END | |||
| mr BO, B | |||
| DSTRM_LT_L2x8_LOOP_START: | |||
| INIT_8x2 | |||
| addic. L, KK, 0 | |||
| ble DSTRM_LT_L2x8_SAVE | |||
| DSTRM_LT_L2x8_LOOP: | |||
| KERNEL_8x2 | |||
| addic. L, L, -1 | |||
| bgt DSTRM_LT_L2x8_LOOP | |||
| DSTRM_LT_L2x8_SAVE: | |||
| SOLVE_LT_8x2 | |||
| addi CO, CO, 8*SIZE | |||
| sub T3, K, KK | |||
| sub T4, K, KK | |||
| slwi T3, T3, 3+BASE_SHIFT | |||
| slwi T4, T4, 1+BASE_SHIFT | |||
| add AO, AO, T3 | |||
| add BO, BO, T4 | |||
| addi KK, KK, 8 | |||
| DSTRM_LT_L2x8_END: | |||
| DSTRM_LT_L2x4_BEGIN: | |||
| /* 4-row tile, taken when M & 4 is set. */ | |||
| andi. T1, M, 4 | |||
| ble DSTRM_LT_L2x4_END | |||
| mr BO, B | |||
| DSTRM_LT_L2x4_LOOP_START: | |||
| INIT_4x2 | |||
| addic. L, KK, 0 | |||
| ble DSTRM_LT_L2x4_SAVE | |||
| DSTRM_LT_L2x4_LOOP: | |||
| KERNEL_4x2 | |||
| addic. L, L, -1 | |||
| bgt DSTRM_LT_L2x4_LOOP | |||
| DSTRM_LT_L2x4_SAVE: | |||
| SOLVE_LT_4x2 | |||
| addi CO, CO, 4*SIZE | |||
| sub T3, K, KK | |||
| sub T4, K, KK | |||
| slwi T3, T3, 2+BASE_SHIFT | |||
| slwi T4, T4, 1+BASE_SHIFT | |||
| add AO, AO, T3 | |||
| add BO, BO, T4 | |||
| addi KK, KK, 4 | |||
| DSTRM_LT_L2x4_END: | |||
| DSTRM_LT_L2x2_BEGIN: | |||
| /* 2-row tile, taken when M & 2 is set. */ | |||
| andi. T1, M, 2 | |||
| ble DSTRM_LT_L2x2_END | |||
| mr BO, B | |||
| DSTRM_LT_L2x2_LOOP_START: | |||
| INIT_2x2 | |||
| addic. L, KK, 0 | |||
| ble DSTRM_LT_L2x2_SAVE | |||
| DSTRM_LT_L2x2_LOOP: | |||
| KERNEL_2x2 | |||
| addic. L, L, -1 | |||
| bgt DSTRM_LT_L2x2_LOOP | |||
| DSTRM_LT_L2x2_SAVE: | |||
| SOLVE_LT_2x2 | |||
| addi CO, CO, 2*SIZE | |||
| sub T3, K, KK | |||
| sub T4, K, KK | |||
| slwi T3, T3, 1+BASE_SHIFT | |||
| slwi T4, T4, 1+BASE_SHIFT | |||
| add AO, AO, T3 | |||
| add BO, BO, T4 | |||
| addi KK, KK, 2 | |||
| DSTRM_LT_L2x2_END: | |||
| DSTRM_LT_L2x1_BEGIN: | |||
| /* 1-row tile, taken when M & 1 is set. */ | |||
| andi. T1, M, 1 | |||
| ble DSTRM_LT_L2x1_END | |||
| mr BO, B | |||
| DSTRM_LT_L2x1_LOOP_START: | |||
| INIT_1x2 | |||
| addic. L, KK, 0 | |||
| ble DSTRM_LT_L2x1_SAVE | |||
| DSTRM_LT_L2x1_LOOP: | |||
| KERNEL_1x2 | |||
| addic. L, L, -1 | |||
| bgt DSTRM_LT_L2x1_LOOP | |||
| DSTRM_LT_L2x1_SAVE: | |||
| SOLVE_LT_1x2 | |||
| addi CO, CO, 1*SIZE | |||
| sub T3, K, KK | |||
| sub T4, K, KK | |||
| slwi T3, T3, 0+BASE_SHIFT | |||
| slwi T4, T4, 1+BASE_SHIFT | |||
| add AO, AO, T3 | |||
| add BO, BO, T4 | |||
| addi KK, KK, 1 | |||
| DSTRM_LT_L2x1_END: | |||
| /* Panel finished: B += K*2 scaled by 1 << BASE_SHIFT. */ | |||
| slwi T1, K, 1+BASE_SHIFT | |||
| add B, B, T1 | |||
| DSTRM_LT_L2_END: | |||
| /* ---- Single remaining column, processed once when N & 1 is set. ---- */ | |||
| /* Same structure as the wider sections, with the *x1 macros (defined in the macro file, */ | |||
| /* not visible here). C and B are not advanced afterwards, and there is no M & 15 shortcut: */ | |||
| /* each remainder tile tests its own bit of M. */ | |||
| DSTRM_LT_L1_BEGIN: | |||
| andi. T1, N, 1 | |||
| ble DSTRM_LT_L1_END | |||
| /* CO = current C column, AO = start of A, KK restarts from OFFSET. */ | |||
| mr CO, C | |||
| mr AO, A | |||
| mr KK, OFFSET | |||
| /* I = M >> 4: number of full 16-row tiles. */ | |||
| srawi. I, M, 4 | |||
| ble DSTRM_LT_L1x16_END | |||
| DSTRM_LT_L1x16_BEGIN: | |||
| mr BO, B | |||
| DSTRM_LT_L1x16_LOOP_START: | |||
| INIT_16x1 | |||
| /* L = KK; the KERNEL loop is skipped when KK <= 0. */ | |||
| addic. L, KK, 0 | |||
| ble DSTRM_LT_L1x16_SAVE | |||
| DSTRM_LT_L1x16_LOOP: | |||
| KERNEL_16x1 | |||
| addic. L, L, -1 | |||
| bgt DSTRM_LT_L1x16_LOOP | |||
| DSTRM_LT_L1x16_SAVE: | |||
| SOLVE_LT_16x1 | |||
| /* Next tile: CO += 16*SIZE; AO += (K-KK)*16 and BO += (K-KK), both scaled by 1 << BASE_SHIFT; KK += 16. */ | |||
| addi CO, CO, 16*SIZE | |||
| sub T3, K, KK | |||
| sub T4, K, KK | |||
| slwi T3, T3, 4+BASE_SHIFT | |||
| slwi T4, T4, 0+BASE_SHIFT | |||
| add AO, AO, T3 | |||
| add BO, BO, T4 | |||
| addi KK, KK, 16 | |||
| addic. I, I, -1 | |||
| bgt DSTRM_LT_L1x16_BEGIN | |||
| DSTRM_LT_L1x16_END: | |||
| DSTRM_LT_L1x8_BEGIN: | |||
| /* 8-row tile, taken when M & 8 is set. */ | |||
| andi. T1, M, 8 | |||
| ble DSTRM_LT_L1x8_END | |||
| mr BO, B | |||
| DSTRM_LT_L1x8_LOOP_START: | |||
| INIT_8x1 | |||
| addic. L, KK, 0 | |||
| ble DSTRM_LT_L1x8_SAVE | |||
| DSTRM_LT_L1x8_LOOP: | |||
| KERNEL_8x1 | |||
| addic. L, L, -1 | |||
| bgt DSTRM_LT_L1x8_LOOP | |||
| DSTRM_LT_L1x8_SAVE: | |||
| SOLVE_LT_8x1 | |||
| addi CO, CO, 8*SIZE | |||
| sub T3, K, KK | |||
| sub T4, K, KK | |||
| slwi T3, T3, 3+BASE_SHIFT | |||
| slwi T4, T4, 0+BASE_SHIFT | |||
| add AO, AO, T3 | |||
| add BO, BO, T4 | |||
| addi KK, KK, 8 | |||
| DSTRM_LT_L1x8_END: | |||
| DSTRM_LT_L1x4_BEGIN: | |||
| /* 4-row tile, taken when M & 4 is set. */ | |||
| andi. T1, M, 4 | |||
| ble DSTRM_LT_L1x4_END | |||
| mr BO, B | |||
| DSTRM_LT_L1x4_LOOP_START: | |||
| INIT_4x1 | |||
| addic. L, KK, 0 | |||
| ble DSTRM_LT_L1x4_SAVE | |||
| DSTRM_LT_L1x4_LOOP: | |||
| KERNEL_4x1 | |||
| addic. L, L, -1 | |||
| bgt DSTRM_LT_L1x4_LOOP | |||
| DSTRM_LT_L1x4_SAVE: | |||
| SOLVE_LT_4x1 | |||
| addi CO, CO, 4*SIZE | |||
| sub T3, K, KK | |||
| sub T4, K, KK | |||
| slwi T3, T3, 2+BASE_SHIFT | |||
| slwi T4, T4, 0+BASE_SHIFT | |||
| add AO, AO, T3 | |||
| add BO, BO, T4 | |||
| addi KK, KK, 4 | |||
| DSTRM_LT_L1x4_END: | |||
| DSTRM_LT_L1x2_BEGIN: | |||
| /* 2-row tile, taken when M & 2 is set. */ | |||
| andi. T1, M, 2 | |||
| ble DSTRM_LT_L1x2_END | |||
| mr BO, B | |||
| DSTRM_LT_L1x2_LOOP_START: | |||
| INIT_2x1 | |||
| addic. L, KK, 0 | |||
| ble DSTRM_LT_L1x2_SAVE | |||
| DSTRM_LT_L1x2_LOOP: | |||
| KERNEL_2x1 | |||
| addic. L, L, -1 | |||
| bgt DSTRM_LT_L1x2_LOOP | |||
| DSTRM_LT_L1x2_SAVE: | |||
| SOLVE_LT_2x1 | |||
| addi CO, CO, 2*SIZE | |||
| sub T3, K, KK | |||
| sub T4, K, KK | |||
| slwi T3, T3, 1+BASE_SHIFT | |||
| slwi T4, T4, 0+BASE_SHIFT | |||
| add AO, AO, T3 | |||
| add BO, BO, T4 | |||
| addi KK, KK, 2 | |||
| DSTRM_LT_L1x2_END: | |||
| DSTRM_LT_L1x1_BEGIN: | |||
| /* 1-row tile, taken when M & 1 is set. */ | |||
| andi. T1, M, 1 | |||
| ble DSTRM_LT_L1x1_END | |||
| mr BO, B | |||
| DSTRM_LT_L1x1_LOOP_START: | |||
| INIT_1x1 | |||
| addic. L, KK, 0 | |||
| ble DSTRM_LT_L1x1_SAVE | |||
| DSTRM_LT_L1x1_LOOP: | |||
| KERNEL_1x1 | |||
| addic. L, L, -1 | |||
| bgt DSTRM_LT_L1x1_LOOP | |||
| DSTRM_LT_L1x1_SAVE: | |||
| SOLVE_LT_1x1 | |||
| addi CO, CO, 1*SIZE | |||
| sub T3, K, KK | |||
| sub T4, K, KK | |||
| slwi T3, T3, 0+BASE_SHIFT | |||
| slwi T4, T4, 0+BASE_SHIFT | |||
| add AO, AO, T3 | |||
| add BO, BO, T4 | |||
| addi KK, KK, 1 | |||
| DSTRM_LT_L1x1_END: | |||
| DSTRM_LT_L1_END: | |||