| @@ -236,7 +236,7 @@ static inline int blas_quickdivide(blasint x, blasint y){ | |||||
| #define HAVE_PREFETCH | #define HAVE_PREFETCH | ||||
| #endif | #endif | ||||
| #if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) | |||||
| #if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8) | |||||
| #define DCBT_ARG 0 | #define DCBT_ARG 0 | ||||
| #else | #else | ||||
| #define DCBT_ARG 8 | #define DCBT_ARG 8 | ||||
| @@ -258,6 +258,13 @@ static inline int blas_quickdivide(blasint x, blasint y){ | |||||
| #define L1_PREFETCH dcbtst | #define L1_PREFETCH dcbtst | ||||
| #endif | #endif | ||||
/* POWER8-only prefetch settings: defines L1_DUALFETCH, sets
 * L1_PREFETCHSIZE to 16 + 128 * 100 (= 12816), and selects dcbtst as the
 * L1_PREFETCH instruction. Because L1_PREFETCH is defined here, the
 * "#ifndef L1_PREFETCH" fallback that follows (dcbt) does not apply.
 * NOTE(review): L1_PREFETCHSIZE is presumably a byte distance -- confirm
 * against the kernels that consume it. */
| #if defined(POWER8) | |||||
| #define L1_DUALFETCH | |||||
| #define L1_PREFETCHSIZE (16 + 128 * 100) | |||||
| #define L1_PREFETCH dcbtst | |||||
| #endif | |||||
| # | |||||
| #ifndef L1_PREFETCH | #ifndef L1_PREFETCH | ||||
| #define L1_PREFETCH dcbt | #define L1_PREFETCH dcbt | ||||
| #endif | #endif | ||||
| @@ -790,6 +797,8 @@ Lmcount$lazy_ptr: | |||||
| #define BUFFER_SIZE ( 2 << 20) | #define BUFFER_SIZE ( 2 << 20) | ||||
| #elif defined(PPC440FP2) | #elif defined(PPC440FP2) | ||||
| #define BUFFER_SIZE ( 16 << 20) | #define BUFFER_SIZE ( 16 << 20) | ||||
| #elif defined(POWER8) | |||||
| #define BUFFER_SIZE ( 64 << 20) | |||||
| #else | #else | ||||
| #define BUFFER_SIZE ( 16 << 20) | #define BUFFER_SIZE ( 16 << 20) | ||||
| #endif | #endif | ||||
| @@ -55,6 +55,7 @@ | |||||
| #define CPUTYPE_POWER6 5 | #define CPUTYPE_POWER6 5 | ||||
| #define CPUTYPE_CELL 6 | #define CPUTYPE_CELL 6 | ||||
| #define CPUTYPE_PPCG4 7 | #define CPUTYPE_PPCG4 7 | ||||
| #define CPUTYPE_POWER8 8 | |||||
| char *cpuname[] = { | char *cpuname[] = { | ||||
| "UNKNOWN", | "UNKNOWN", | ||||
| @@ -65,6 +66,7 @@ char *cpuname[] = { | |||||
| "POWER6", | "POWER6", | ||||
| "CELL", | "CELL", | ||||
| "PPCG4", | "PPCG4", | ||||
| "POWER8" | |||||
| }; | }; | ||||
| char *lowercpuname[] = { | char *lowercpuname[] = { | ||||
| @@ -76,6 +78,7 @@ char *lowercpuname[] = { | |||||
| "power6", | "power6", | ||||
| "cell", | "cell", | ||||
| "ppcg4", | "ppcg4", | ||||
| "power8" | |||||
| }; | }; | ||||
| char *corename[] = { | char *corename[] = { | ||||
| @@ -87,6 +90,7 @@ char *corename[] = { | |||||
| "POWER6", | "POWER6", | ||||
| "CELL", | "CELL", | ||||
| "PPCG4", | "PPCG4", | ||||
| "POWER8" | |||||
| }; | }; | ||||
| int detect(void){ | int detect(void){ | ||||
| @@ -115,7 +119,7 @@ int detect(void){ | |||||
| if (!strncasecmp(p, "POWER5", 6)) return CPUTYPE_POWER5; | if (!strncasecmp(p, "POWER5", 6)) return CPUTYPE_POWER5; | ||||
| if (!strncasecmp(p, "POWER6", 6)) return CPUTYPE_POWER6; | if (!strncasecmp(p, "POWER6", 6)) return CPUTYPE_POWER6; | ||||
| if (!strncasecmp(p, "POWER7", 6)) return CPUTYPE_POWER6; | if (!strncasecmp(p, "POWER7", 6)) return CPUTYPE_POWER6; | ||||
| if (!strncasecmp(p, "POWER8", 6)) return CPUTYPE_POWER6; | |||||
| if (!strncasecmp(p, "POWER8", 6)) return CPUTYPE_POWER8; | |||||
| if (!strncasecmp(p, "Cell", 4)) return CPUTYPE_CELL; | if (!strncasecmp(p, "Cell", 4)) return CPUTYPE_CELL; | ||||
| if (!strncasecmp(p, "7447", 4)) return CPUTYPE_PPCG4; | if (!strncasecmp(p, "7447", 4)) return CPUTYPE_PPCG4; | ||||
| @@ -552,7 +552,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define CORENAME "POWER5" | #define CORENAME "POWER5" | ||||
| #endif | #endif | ||||
| #if defined(FORCE_POWER6) || defined(FORCE_POWER7) || defined(FORCE_POWER8) | |||||
| #if defined(FORCE_POWER6) || defined(FORCE_POWER7) | |||||
| #define FORCE | #define FORCE | ||||
| #define ARCHITECTURE "POWER" | #define ARCHITECTURE "POWER" | ||||
| #define SUBARCHITECTURE "POWER6" | #define SUBARCHITECTURE "POWER6" | ||||
| @@ -565,6 +565,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define CORENAME "POWER6" | #define CORENAME "POWER6" | ||||
| #endif | #endif | ||||
/* Target description used when FORCE_POWER8 is defined.
 * Defines FORCE and reports architecture "POWER", sub-architecture and
 * core name "POWER8", library name "power8", and sub-directory "power".
 * ARCHCONFIG is the string of -D flags for this target: POWER8 itself,
 * L1 data size 65536 with line size 128, L2 size 4194304 with line size
 * 128 and associativity 8, DTB_DEFAULT_ENTRIES 128 and DTB_SIZE 4096.
 * NOTE(review): sizes are presumably in bytes -- confirm with the code
 * that reads these macros. */
| #if defined(FORCE_POWER8) | |||||
| #define FORCE | |||||
| #define ARCHITECTURE "POWER" | |||||
| #define SUBARCHITECTURE "POWER8" | |||||
| #define SUBDIRNAME "power" | |||||
| #define ARCHCONFIG "-DPOWER8 " \ | |||||
| "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=128 " \ | |||||
| "-DL2_SIZE=4194304 -DL2_LINESIZE=128 " \ | |||||
| "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " | |||||
| #define LIBNAME "power8" | |||||
| #define CORENAME "POWER8" | |||||
| #endif | |||||
| #ifdef FORCE_PPCG4 | #ifdef FORCE_PPCG4 | ||||
| #define FORCE | #define FORCE | ||||
| #define ARCHITECTURE "POWER" | #define ARCHITECTURE "POWER" | ||||
| @@ -36,6 +36,11 @@ ifeq ($(CORE), HASWELL) | |||||
| USE_TRMM = 1 | USE_TRMM = 1 | ||||
| endif | endif | ||||
# When CORE is POWER8, set USE_TRMM = 1 (the same value the HASWELL case
# just above sets).
# NOTE(review): presumably this makes the build use the separate
# xTRMMKERNEL sources instead of reusing the GEMM kernels -- confirm
# where USE_TRMM is tested.
| ifeq ($(CORE), POWER8) | |||||
| USE_TRMM = 1 | |||||
| endif | |||||
| SKERNELOBJS += \ | SKERNELOBJS += \ | ||||
| @@ -1,57 +1,3 @@ | |||||
| SGEMM_BETA = gemm_beta.S | |||||
| DGEMM_BETA = gemm_beta.S | |||||
| CGEMM_BETA = zgemm_beta.S | |||||
| ZGEMM_BETA = zgemm_beta.S | |||||
| ifndef SSYMV_U_KERNEL | |||||
| SSYMV_U_KERNEL = symv_U.S | |||||
| endif | |||||
| ifndef SSYMV_L_KERNEL | |||||
| SSYMV_L_KERNEL = symv_L.S | |||||
| endif | |||||
| ifndef DSYMV_U_KERNEL | |||||
| DSYMV_U_KERNEL = symv_U.S | |||||
| endif | |||||
| ifndef DSYMV_L_KERNEL | |||||
| DSYMV_L_KERNEL = symv_L.S | |||||
| endif | |||||
| ifndef CSYMV_U_KERNEL | |||||
| CSYMV_U_KERNEL = zsymv_U.S | |||||
| endif | |||||
| ifndef CSYMV_L_KERNEL | |||||
| CSYMV_L_KERNEL = zsymv_L.S | |||||
| endif | |||||
| ifndef ZSYMV_U_KERNEL | |||||
| ZSYMV_U_KERNEL = zsymv_U.S | |||||
| endif | |||||
| ifndef ZSYMV_L_KERNEL | |||||
| ZSYMV_L_KERNEL = zsymv_L.S | |||||
| endif | |||||
| ifndef CHEMV_U_KERNEL | |||||
| CHEMV_U_KERNEL = zsymv_U.S | |||||
| endif | |||||
| ifndef CHEMV_L_KERNEL | |||||
| CHEMV_L_KERNEL = zsymv_L.S | |||||
| endif | |||||
| ifndef ZHEMV_U_KERNEL | |||||
| ZHEMV_U_KERNEL = zsymv_U.S | |||||
| endif | |||||
| ifndef ZHEMV_L_KERNEL | |||||
| ZHEMV_L_KERNEL = zsymv_L.S | |||||
| endif | |||||
| ifndef STRSMKERNEL_LN | ifndef STRSMKERNEL_LN | ||||
| STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | ||||
| endif | endif | ||||
| @@ -84,3 +30,19 @@ ifndef CTRSMKERNEL_RT | |||||
| CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | ||||
| endif | endif | ||||
# Default sources for the four xGEMM_BETA variables. Each assignment is
# guarded by ifndef, so it only takes effect when the variable has not
# already been set: gemm_beta.S for the S and D variants, zgemm_beta.S
# for the C and Z variants.
| ifndef SGEMM_BETA | |||||
| SGEMM_BETA = gemm_beta.S | |||||
| endif | |||||
| ifndef DGEMM_BETA | |||||
| DGEMM_BETA = gemm_beta.S | |||||
| endif | |||||
| ifndef CGEMM_BETA | |||||
| CGEMM_BETA = zgemm_beta.S | |||||
| endif | |||||
| ifndef ZGEMM_BETA | |||||
| ZGEMM_BETA = zgemm_beta.S | |||||
| endif | |||||
| @@ -0,0 +1,175 @@ | |||||
| #SGEMM_BETA = ../generic/gemm_beta.c | |||||
| #DGEMM_BETA = ../generic/gemm_beta.c | |||||
| #CGEMM_BETA = ../generic/zgemm_beta.c | |||||
| #ZGEMM_BETA = ../generic/zgemm_beta.c | |||||
# TRMM kernels. D and Z use the POWER8-specific assembly sources
# (dtrmm 16x4, ztrmm 8x2, per their file names); S reuses
# gemm_kernel_power6.S and C uses the generic 2x2 C kernel.
| STRMMKERNEL = gemm_kernel_power6.S | |||||
| DTRMMKERNEL = dtrmm_kernel_16x4_power8.S | |||||
| CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c | |||||
| ZTRMMKERNEL = ztrmm_kernel_8x2_power8.S | |||||
# SGEMM: POWER6 assembly kernel with the generic 4-wide N/T copy
# routines; only the "O" copy variants are set for this precision.
| SGEMMKERNEL = gemm_kernel_power6.S | |||||
| SGEMMONCOPY = ../generic/gemm_ncopy_4.c | |||||
| SGEMMOTCOPY = ../generic/gemm_tcopy_4.c | |||||
| SGEMMONCOPYOBJ = sgemm_oncopy.o | |||||
| SGEMMOTCOPYOBJ = sgemm_otcopy.o | |||||
# DGEMM: POWER8 16x4 assembly kernel. The "I" copies are the generic
# 16-wide C routines, the "O" copies the 4-wide assembly routines.
| DGEMMKERNEL = dgemm_kernel_16x4_power8.S | |||||
| DGEMMINCOPY = ../generic/gemm_ncopy_16.c | |||||
| DGEMMITCOPY = ../generic/gemm_tcopy_16.c | |||||
| DGEMMONCOPY = gemm_ncopy_4.S | |||||
| DGEMMOTCOPY = gemm_tcopy_4.S | |||||
| DGEMMINCOPYOBJ = dgemm_incopy.o | |||||
| DGEMMITCOPYOBJ = dgemm_itcopy.o | |||||
| DGEMMONCOPYOBJ = dgemm_oncopy.o | |||||
| DGEMMOTCOPYOBJ = dgemm_otcopy.o | |||||
# CGEMM: generic 2x2 C kernel with the generic 2-wide copy routines.
| CGEMMKERNEL = ../generic/zgemmkernel_2x2.c | |||||
| CGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||||
| CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||||
| CGEMMONCOPYOBJ = cgemm_oncopy.o | |||||
| CGEMMOTCOPYOBJ = cgemm_otcopy.o | |||||
# ZGEMM: POWER8 8x2 assembly kernel. The "O" copies are the generic
# 2-wide C routines, the "I" copies the generic 8-wide C routines.
| ZGEMMKERNEL = zgemm_kernel_8x2_power8.S | |||||
| ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||||
| ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||||
| ZGEMMINCOPY = ../generic/zgemm_ncopy_8.c | |||||
| ZGEMMITCOPY = ../generic/zgemm_tcopy_8.c | |||||
| ZGEMMONCOPYOBJ = zgemm_oncopy.o | |||||
| ZGEMMOTCOPYOBJ = zgemm_otcopy.o | |||||
| ZGEMMINCOPYOBJ = zgemm_incopy.o | |||||
| ZGEMMITCOPYOBJ = zgemm_itcopy.o | |||||
# TRSM kernels. All four precisions (S, D, C, Z) and all four variants
# (LN, LT, RN, RT) point at the same generic C sources
# ../generic/trsm_kernel_{LN,LT,RN,RT}.c; no POWER8-specific TRSM kernel
# is selected here.
| STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
| DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
| CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
| ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
| #Todo: CGEMM3MKERNEL should be 4x4 blocksizes. | |||||
| #CGEMM3MKERNEL = zgemm3m_kernel_8x4_sse3.S | |||||
| #ZGEMM3MKERNEL = zgemm3m_kernel_4x4_sse3.S | |||||
| #Pure C for other kernels | |||||
| #SAMAXKERNEL = ../arm/amax.c | |||||
| #DAMAXKERNEL = ../arm/amax.c | |||||
| #CAMAXKERNEL = ../arm/zamax.c | |||||
| #ZAMAXKERNEL = ../arm/zamax.c | |||||
| # | |||||
| #SAMINKERNEL = ../arm/amin.c | |||||
| #DAMINKERNEL = ../arm/amin.c | |||||
| #CAMINKERNEL = ../arm/zamin.c | |||||
| #ZAMINKERNEL = ../arm/zamin.c | |||||
| # | |||||
| #SMAXKERNEL = ../arm/max.c | |||||
| #DMAXKERNEL = ../arm/max.c | |||||
| # | |||||
| #SMINKERNEL = ../arm/min.c | |||||
| #DMINKERNEL = ../arm/min.c | |||||
| # | |||||
| #ISAMAXKERNEL = ../arm/iamax.c | |||||
| #IDAMAXKERNEL = ../arm/iamax.c | |||||
| #ICAMAXKERNEL = ../arm/izamax.c | |||||
| #IZAMAXKERNEL = ../arm/izamax.c | |||||
| # | |||||
| #ISAMINKERNEL = ../arm/iamin.c | |||||
| #IDAMINKERNEL = ../arm/iamin.c | |||||
| #ICAMINKERNEL = ../arm/izamin.c | |||||
| #IZAMINKERNEL = ../arm/izamin.c | |||||
| # | |||||
| #ISMAXKERNEL = ../arm/imax.c | |||||
| #IDMAXKERNEL = ../arm/imax.c | |||||
| # | |||||
| #ISMINKERNEL = ../arm/imin.c | |||||
| #IDMINKERNEL = ../arm/imin.c | |||||
| # | |||||
| #SASUMKERNEL = ../arm/asum.c | |||||
| #DASUMKERNEL = ../arm/asum.c | |||||
| #CASUMKERNEL = ../arm/zasum.c | |||||
| #ZASUMKERNEL = ../arm/zasum.c | |||||
| # | |||||
| #SAXPYKERNEL = ../arm/axpy.c | |||||
| #DAXPYKERNEL = ../arm/axpy.c | |||||
| #CAXPYKERNEL = ../arm/zaxpy.c | |||||
| #ZAXPYKERNEL = ../arm/zaxpy.c | |||||
| # | |||||
| #SCOPYKERNEL = ../arm/copy.c | |||||
| #DCOPYKERNEL = ../arm/copy.c | |||||
| #CCOPYKERNEL = ../arm/zcopy.c | |||||
| #ZCOPYKERNEL = ../arm/zcopy.c | |||||
| # | |||||
| #SDOTKERNEL = ../arm/dot.c | |||||
| #DDOTKERNEL = ../arm/dot.c | |||||
| #CDOTKERNEL = ../arm/zdot.c | |||||
| #ZDOTKERNEL = ../arm/zdot.c | |||||
| # | |||||
| #SNRM2KERNEL = ../arm/nrm2.c | |||||
| #DNRM2KERNEL = ../arm/nrm2.c | |||||
| #CNRM2KERNEL = ../arm/znrm2.c | |||||
| #ZNRM2KERNEL = ../arm/znrm2.c | |||||
| # | |||||
| #SROTKERNEL = ../arm/rot.c | |||||
| #DROTKERNEL = ../arm/rot.c | |||||
| #CROTKERNEL = ../arm/zrot.c | |||||
| #ZROTKERNEL = ../arm/zrot.c | |||||
| # | |||||
| #SSCALKERNEL = ../arm/scal.c | |||||
| #DSCALKERNEL = ../arm/scal.c | |||||
| #CSCALKERNEL = ../arm/zscal.c | |||||
| #ZSCALKERNEL = ../arm/zscal.c | |||||
| # | |||||
| #SSWAPKERNEL = ../arm/swap.c | |||||
| #DSWAPKERNEL = ../arm/swap.c | |||||
| #CSWAPKERNEL = ../arm/zswap.c | |||||
| #ZSWAPKERNEL = ../arm/zswap.c | |||||
| # | |||||
| #SGEMVNKERNEL = ../arm/gemv_n.c | |||||
| #DGEMVNKERNEL = ../arm/gemv_n.c | |||||
| #CGEMVNKERNEL = ../arm/zgemv_n.c | |||||
| #ZGEMVNKERNEL = ../arm/zgemv_n.c | |||||
| # | |||||
| #SGEMVTKERNEL = ../arm/gemv_t.c | |||||
| #DGEMVTKERNEL = ../arm/gemv_t.c | |||||
| #CGEMVTKERNEL = ../arm/zgemv_t.c | |||||
| #ZGEMVTKERNEL = ../arm/zgemv_t.c | |||||
| #SSYMV_U_KERNEL = ../generic/symv_k.c | |||||
| #SSYMV_L_KERNEL = ../generic/symv_k.c | |||||
| #DSYMV_U_KERNEL = ../generic/symv_k.c | |||||
| #DSYMV_L_KERNEL = ../generic/symv_k.c | |||||
| #QSYMV_U_KERNEL = ../generic/symv_k.c | |||||
| #QSYMV_L_KERNEL = ../generic/symv_k.c | |||||
| #CSYMV_U_KERNEL = ../generic/zsymv_k.c | |||||
| #CSYMV_L_KERNEL = ../generic/zsymv_k.c | |||||
| #ZSYMV_U_KERNEL = ../generic/zsymv_k.c | |||||
| #ZSYMV_L_KERNEL = ../generic/zsymv_k.c | |||||
| #XSYMV_U_KERNEL = ../generic/zsymv_k.c | |||||
| #XSYMV_L_KERNEL = ../generic/zsymv_k.c | |||||
| #ZHEMV_U_KERNEL = ../generic/zhemv_k.c | |||||
| #ZHEMV_L_KERNEL = ../generic/zhemv_k.c | |||||
# Remaining helpers come from the generic C sources: lsame, the S/D/Q
# cabs kernels (all three share ../generic/cabs.c), and the C/Z GEMM3M
# kernels, which both use ../generic/zgemm3mkernel_dump.c.
| LSAME_KERNEL = ../generic/lsame.c | |||||
| SCABS_KERNEL = ../generic/cabs.c | |||||
| DCABS_KERNEL = ../generic/cabs.c | |||||
| QCABS_KERNEL = ../generic/cabs.c | |||||
| #Dump kernel | |||||
| CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c | |||||
| ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c | |||||
| @@ -0,0 +1,64 @@ | |||||
/* Symbolic names vs0 .. vs63 for the plain numbers 0 .. 63, so that
 * assembly sources including this header can write a register operand
 * as "vsN" and have the preprocessor substitute the bare number N.
 * NOTE(review): the names presumably refer to the 64 VSX vector-scalar
 * registers -- confirm against the kernels that use them. */
| #define vs0 0 | |||||
| #define vs1 1 | |||||
| #define vs2 2 | |||||
| #define vs3 3 | |||||
| #define vs4 4 | |||||
| #define vs5 5 | |||||
| #define vs6 6 | |||||
| #define vs7 7 | |||||
| #define vs8 8 | |||||
| #define vs9 9 | |||||
| #define vs10 10 | |||||
| #define vs11 11 | |||||
| #define vs12 12 | |||||
| #define vs13 13 | |||||
| #define vs14 14 | |||||
| #define vs15 15 | |||||
| #define vs16 16 | |||||
| #define vs17 17 | |||||
| #define vs18 18 | |||||
| #define vs19 19 | |||||
| #define vs20 20 | |||||
| #define vs21 21 | |||||
| #define vs22 22 | |||||
| #define vs23 23 | |||||
| #define vs24 24 | |||||
| #define vs25 25 | |||||
| #define vs26 26 | |||||
| #define vs27 27 | |||||
| #define vs28 28 | |||||
| #define vs29 29 | |||||
| #define vs30 30 | |||||
| #define vs31 31 | |||||
| #define vs32 32 | |||||
| #define vs33 33 | |||||
| #define vs34 34 | |||||
| #define vs35 35 | |||||
| #define vs36 36 | |||||
| #define vs37 37 | |||||
| #define vs38 38 | |||||
| #define vs39 39 | |||||
| #define vs40 40 | |||||
| #define vs41 41 | |||||
| #define vs42 42 | |||||
| #define vs43 43 | |||||
| #define vs44 44 | |||||
| #define vs45 45 | |||||
| #define vs46 46 | |||||
| #define vs47 47 | |||||
| #define vs48 48 | |||||
| #define vs49 49 | |||||
| #define vs50 50 | |||||
| #define vs51 51 | |||||
| #define vs52 52 | |||||
| #define vs53 53 | |||||
| #define vs54 54 | |||||
| #define vs55 55 | |||||
| #define vs56 56 | |||||
| #define vs57 57 | |||||
| #define vs58 58 | |||||
| #define vs59 59 | |||||
| #define vs60 60 | |||||
| #define vs61 61 | |||||
| #define vs62 62 | |||||
| #define vs63 63 | |||||
| @@ -0,0 +1,348 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013-2016, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| /************************************************************************************** | |||||
| * 2016/03/05 Werner Saar (wernsaar@googlemail.com) | |||||
| * BLASTEST : OK | |||||
| * CTEST : OK | |||||
| * TEST : OK | |||||
| * LAPACK-TEST : OK | |||||
| **************************************************************************************/ | |||||
| /*********************************************************************/ | |||||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||||
| /* All rights reserved. */ | |||||
| /* */ | |||||
| /* Redistribution and use in source and binary forms, with or */ | |||||
| /* without modification, are permitted provided that the following */ | |||||
| /* conditions are met: */ | |||||
| /* */ | |||||
| /* 1. Redistributions of source code must retain the above */ | |||||
| /* copyright notice, this list of conditions and the following */ | |||||
| /* disclaimer. */ | |||||
| /* */ | |||||
| /* 2. Redistributions in binary form must reproduce the above */ | |||||
| /* copyright notice, this list of conditions and the following */ | |||||
| /* disclaimer in the documentation and/or other materials */ | |||||
| /* provided with the distribution. */ | |||||
| /* */ | |||||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||||
| /* */ | |||||
| /* The views and conclusions contained in the software and */ | |||||
| /* documentation are those of the authors and should not be */ | |||||
| /* interpreted as representing official policies, either expressed */ | |||||
| /* or implied, of The University of Texas at Austin. */ | |||||
| /*********************************************************************/ | |||||
/* ASSEMBLER is defined before common.h is included; def_vsx.h is
 * included for the vsN names used below. */
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| #include "def_vsx.h" | |||||
/* LOAD is the pointer-sized load: lwz on 32-bit builds, ld on 64-bit. */
| #ifndef __64BIT__ | |||||
| #define LOAD lwz | |||||
| #else | |||||
| #define LOAD ld | |||||
| #endif | |||||
/* Stack frame layout: total frame size plus the SP-relative slots that
 * hold alpha (ALPHA_SP) and a zero word (FZERO). 64-bit builds use a
 * 320-byte frame, 32-bit builds a 240-byte frame. */
| #ifdef __64BIT__ | |||||
| #define STACKSIZE 320 | |||||
| #define ALPHA_SP 296(SP) | |||||
| #define FZERO 304(SP) | |||||
| #else | |||||
| #define STACKSIZE 240 | |||||
| #define ALPHA_SP 224(SP) | |||||
| #define FZERO 232(SP) | |||||
| #endif | |||||
/* Incoming arguments M, N, K are always in r3, r4, r5. */
| #define M r3 | |||||
| #define N r4 | |||||
| #define K r5 | |||||
/* Register names for A, B, C, LDC and OFFSET on Linux: r6..r10 in that
 * order on 32-bit; on 64-bit A..LDC are r7..r10 and OFFSET is r6. */
| #ifdef linux | |||||
| #ifndef __64BIT__ | |||||
| #define A r6 | |||||
| #define B r7 | |||||
| #define C r8 | |||||
| #define LDC r9 | |||||
| #define OFFSET r10 | |||||
| #else | |||||
| #define A r7 | |||||
| #define B r8 | |||||
| #define C r9 | |||||
| #define LDC r10 | |||||
| #define OFFSET r6 | |||||
| #endif | |||||
| #endif | |||||
/* Same names on AIX / Apple. The 32-bit DOUBLE build shifts A, B, C to
 * r8..r10 and names r7 as LDC; every other case matches the 64-bit
 * Linux assignment above. */
| #if defined(_AIX) || defined(__APPLE__) | |||||
| #if !defined(__64BIT__) && defined(DOUBLE) | |||||
| #define A r8 | |||||
| #define B r9 | |||||
| #define C r10 | |||||
| #define LDC r7 | |||||
| #define OFFSET r6 | |||||
| #else | |||||
| #define A r7 | |||||
| #define B r8 | |||||
| #define C r9 | |||||
| #define LDC r10 | |||||
| #define OFFSET r6 | |||||
| #endif | |||||
| #endif | |||||
/* alpha_r names vs18. o0 is the literal 0; o8, o16, o24, o32, o48 name
 * general registers. The remaining names (ALPHA, L, T1, KK, BB, I, J,
 * AO, BO, CO, PRE, T2) alias r17..r31; all names below together cover
 * r15..r31. */
| #define alpha_r vs18 | |||||
| #define o0 0 | |||||
| #define o8 r15 | |||||
| #define o24 r16 | |||||
| #define ALPHA r17 | |||||
| #define L r18 | |||||
| #define T1 r19 | |||||
| #define KK r20 | |||||
| #define BB r21 | |||||
| #define I r22 | |||||
| #define J r23 | |||||
| #define AO r24 | |||||
| #define BO r25 | |||||
| #define CO r26 | |||||
| #define o16 r27 | |||||
| #define o32 r28 | |||||
| #define o48 r29 | |||||
| #define PRE r30 | |||||
| #define T2 r31 | |||||
/* Pulls in dgemm_macros_16x4_power8.S after the register names above
 * have been defined. */
| #include "dgemm_macros_16x4_power8.S" | |||||
/* Kernel entry point; assembled only when NEEDPARAM is not defined.
 * Flow: allocate the stack frame, save f14-f31 and r15-r31, spill alpha,
 * scale LDC, return early when M, N or K is <= 0, set up constants,
 * run the included compute logic, then restore registers and return 0
 * in r3. */
| #ifndef NEEDPARAM | |||||
| PROLOGUE | |||||
| PROFCODE | |||||
/* Allocate STACKSIZE bytes of stack and keep a zero in r0. */
| addi SP, SP, -STACKSIZE | |||||
| li r0, 0 | |||||
/* Save floating-point registers f14..f31 at offsets 0..136. */
| stfd f14, 0(SP) | |||||
| stfd f15, 8(SP) | |||||
| stfd f16, 16(SP) | |||||
| stfd f17, 24(SP) | |||||
| stfd f18, 32(SP) | |||||
| stfd f19, 40(SP) | |||||
| stfd f20, 48(SP) | |||||
| stfd f21, 56(SP) | |||||
| stfd f22, 64(SP) | |||||
| stfd f23, 72(SP) | |||||
| stfd f24, 80(SP) | |||||
| stfd f25, 88(SP) | |||||
| stfd f26, 96(SP) | |||||
| stfd f27, 104(SP) | |||||
| stfd f28, 112(SP) | |||||
| stfd f29, 120(SP) | |||||
| stfd f30, 128(SP) | |||||
| stfd f31, 136(SP) | |||||
/* Save general registers r31 down to r15: 8-byte slots at 144..272 on
 * 64-bit builds, 4-byte slots at 144..208 on 32-bit builds. */
| #ifdef __64BIT__ | |||||
| std r31, 144(SP) | |||||
| std r30, 152(SP) | |||||
| std r29, 160(SP) | |||||
| std r28, 168(SP) | |||||
| std r27, 176(SP) | |||||
| std r26, 184(SP) | |||||
| std r25, 192(SP) | |||||
| std r24, 200(SP) | |||||
| std r23, 208(SP) | |||||
| std r22, 216(SP) | |||||
| std r21, 224(SP) | |||||
| std r20, 232(SP) | |||||
| std r19, 240(SP) | |||||
| std r18, 248(SP) | |||||
| std r17, 256(SP) | |||||
| std r16, 264(SP) | |||||
| std r15, 272(SP) | |||||
| #else | |||||
| stw r31, 144(SP) | |||||
| stw r30, 148(SP) | |||||
| stw r29, 152(SP) | |||||
| stw r28, 156(SP) | |||||
| stw r27, 160(SP) | |||||
| stw r26, 164(SP) | |||||
| stw r25, 168(SP) | |||||
| stw r24, 172(SP) | |||||
| stw r23, 176(SP) | |||||
| stw r22, 180(SP) | |||||
| stw r21, 184(SP) | |||||
| stw r20, 188(SP) | |||||
| stw r19, 192(SP) | |||||
| stw r18, 196(SP) | |||||
| stw r17, 200(SP) | |||||
| stw r16, 204(SP) | |||||
| stw r15, 208(SP) | |||||
| #endif | |||||
/* Spill f1 to the ALPHA_SP slot (reloaded below into alpha_r) and
 * store the zero word from r0 into the FZERO slot. */
| stfd f1, ALPHA_SP | |||||
| stw r0, FZERO | |||||
/* AIX / Apple 32-bit DOUBLE builds read LDC from the caller's frame
 * (FRAMESLOT(0) above this frame). */
| #if defined(_AIX) || defined(__APPLE__) | |||||
| #if !defined(__64BIT__) && defined(DOUBLE) | |||||
| lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) | |||||
| #endif | |||||
| #endif | |||||
/* Shift LDC left by BASE_SHIFT (defined outside this file).
 * NOTE(review): presumably converts LDC from elements to bytes. */
| slwi LDC, LDC, BASE_SHIFT | |||||
/* TRMM builds only: read OFFSET from the caller's frame. The slot and
 * load width depend on OS, word size and DOUBLE. */
| #if defined(TRMMKERNEL) | |||||
| #if defined(linux) && defined(__64BIT__) | |||||
| ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) | |||||
| #endif | |||||
| #if defined(_AIX) || defined(__APPLE__) | |||||
| #ifdef __64BIT__ | |||||
| ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) | |||||
| #else | |||||
| #ifdef DOUBLE | |||||
| lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) | |||||
| #else | |||||
| lwz OFFSET, FRAMESLOT(0) + STACKSIZE(SP) | |||||
| #endif | |||||
| #endif | |||||
| #endif | |||||
| #endif | |||||
/* Nothing to compute when M, N or K is <= 0: branch to .L999_H1.
 * NOTE(review): .L999_H1 is not defined in this file; presumably it
 * comes from the included logic file -- confirm. */
| cmpwi cr0, M, 0 | |||||
| ble .L999_H1 | |||||
| cmpwi cr0, N, 0 | |||||
| ble .L999_H1 | |||||
| cmpwi cr0, K, 0 | |||||
| ble .L999_H1 | |||||
/* ALPHA = address of the alpha slot; the literal offsets 296 / 224
 * match the ALPHA_SP definitions. */
| #ifdef __64BIT__ | |||||
| addi ALPHA, SP, 296 | |||||
| #else | |||||
| addi ALPHA, SP, 224 | |||||
| #endif | |||||
/* Load constants: PRE = 256 and the o8..o48 registers with the values
 * their names indicate. */
| li PRE, 256 | |||||
| li o8 , 8 | |||||
| li o16, 16 | |||||
| li o24, 24 | |||||
| li o32, 32 | |||||
| li o48, 48 | |||||
/* Load the spilled alpha doubleword and splat it into alpha_r. */
| lxvdsx alpha_r, 0, ALPHA | |||||
/* The compute loops are supplied by the included logic file. */
| #include "dgemm_logic_16x4_power8.S" | |||||
/* Common exit: return value 0 in r3, then restore everything saved in
 * the prologue from the same slots. */
| .L999: | |||||
| addi r3, 0, 0 | |||||
| lfd f14, 0(SP) | |||||
| lfd f15, 8(SP) | |||||
| lfd f16, 16(SP) | |||||
| lfd f17, 24(SP) | |||||
| lfd f18, 32(SP) | |||||
| lfd f19, 40(SP) | |||||
| lfd f20, 48(SP) | |||||
| lfd f21, 56(SP) | |||||
| lfd f22, 64(SP) | |||||
| lfd f23, 72(SP) | |||||
| lfd f24, 80(SP) | |||||
| lfd f25, 88(SP) | |||||
| lfd f26, 96(SP) | |||||
| lfd f27, 104(SP) | |||||
| lfd f28, 112(SP) | |||||
| lfd f29, 120(SP) | |||||
| lfd f30, 128(SP) | |||||
| lfd f31, 136(SP) | |||||
| #ifdef __64BIT__ | |||||
| ld r31, 144(SP) | |||||
| ld r30, 152(SP) | |||||
| ld r29, 160(SP) | |||||
| ld r28, 168(SP) | |||||
| ld r27, 176(SP) | |||||
| ld r26, 184(SP) | |||||
| ld r25, 192(SP) | |||||
| ld r24, 200(SP) | |||||
| ld r23, 208(SP) | |||||
| ld r22, 216(SP) | |||||
| ld r21, 224(SP) | |||||
| ld r20, 232(SP) | |||||
| ld r19, 240(SP) | |||||
| ld r18, 248(SP) | |||||
| ld r17, 256(SP) | |||||
| ld r16, 264(SP) | |||||
| ld r15, 272(SP) | |||||
| #else | |||||
| lwz r31, 144(SP) | |||||
| lwz r30, 148(SP) | |||||
| lwz r29, 152(SP) | |||||
| lwz r28, 156(SP) | |||||
| lwz r27, 160(SP) | |||||
| lwz r26, 164(SP) | |||||
| lwz r25, 168(SP) | |||||
| lwz r24, 172(SP) | |||||
| lwz r23, 176(SP) | |||||
| lwz r22, 180(SP) | |||||
| lwz r21, 184(SP) | |||||
| lwz r20, 188(SP) | |||||
| lwz r19, 192(SP) | |||||
| lwz r18, 196(SP) | |||||
| lwz r17, 200(SP) | |||||
| lwz r16, 204(SP) | |||||
| lwz r15, 208(SP) | |||||
| #endif | |||||
/* Release the stack frame and return to the caller. */
| addi SP, SP, STACKSIZE | |||||
| blr | |||||
| EPILOGUE | |||||
| #endif | |||||
| @@ -0,0 +1,362 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013-2016, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| /************************************************************************************** | |||||
| * 2016/03/05 Werner Saar (wernsaar@googlemail.com) | |||||
| * BLASTEST : OK | |||||
| * CTEST : OK | |||||
| * TEST : OK | |||||
| * LAPACK-TEST : OK | |||||
| **************************************************************************************/ | |||||
| /*********************************************************************/ | |||||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||||
| /* All rights reserved. */ | |||||
| /* */ | |||||
| /* Redistribution and use in source and binary forms, with or */ | |||||
| /* without modification, are permitted provided that the following */ | |||||
| /* conditions are met: */ | |||||
| /* */ | |||||
| /* 1. Redistributions of source code must retain the above */ | |||||
| /* copyright notice, this list of conditions and the following */ | |||||
| /* disclaimer. */ | |||||
| /* */ | |||||
| /* 2. Redistributions in binary form must reproduce the above */ | |||||
| /* copyright notice, this list of conditions and the following */ | |||||
| /* disclaimer in the documentation and/or other materials */ | |||||
| /* provided with the distribution. */ | |||||
| /* */ | |||||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||||
| /* */ | |||||
| /* The views and conclusions contained in the software and */ | |||||
| /* documentation are those of the authors and should not be */ | |||||
| /* interpreted as representing official policies, either expressed */ | |||||
| /* or implied, of The University of Texas at Austin. */ | |||||
| /*********************************************************************/ | |||||
| #define ASSEMBLER /* POWER8 kernel entry: prologue, constant setup, included dtrmm_logic_16x4_power8.S body, epilogue */ | |||||
| #include "common.h" | |||||
| #include "def_vsx.h" | |||||
| #ifndef __64BIT__ /* LOAD = load mnemonic matching the build's word size */ | |||||
| #define LOAD lwz | |||||
| #else | |||||
| #define LOAD ld | |||||
| #endif | |||||
| #ifdef __64BIT__ /* local frame layout: f14-f31 at 0..136, GPR saves from 144, then alpha and a zero word */ | |||||
| #define STACKSIZE 320 | |||||
| #define ALPHA_SP 296(SP) | |||||
| #define FZERO 304(SP) | |||||
| #else | |||||
| #define STACKSIZE 240 | |||||
| #define ALPHA_SP 224(SP) | |||||
| #define FZERO 232(SP) | |||||
| #endif | |||||
| #define M r3 /* incoming dimension arguments */ | |||||
| #define N r4 | |||||
| #define K r5 | |||||
| #ifdef linux /* register aliases for the remaining arguments on linux */ | |||||
| #ifndef __64BIT__ | |||||
| #define A r6 | |||||
| #define B r7 | |||||
| #define C r8 | |||||
| #define LDC r9 | |||||
| #define OFFSET r10 | |||||
| #else | |||||
| #define A r7 | |||||
| #define B r8 | |||||
| #define C r9 | |||||
| #define LDC r10 | |||||
| #define OFFSET r6 /* in this configuration OFFSET is loaded from the caller's frame below (TRMMKERNEL only) */ | |||||
| #endif | |||||
| #endif | |||||
| #if defined(_AIX) || defined(__APPLE__) /* register aliases for the remaining arguments on AIX / Apple */ | |||||
| #if !defined(__64BIT__) && defined(DOUBLE) | |||||
| #define A r8 | |||||
| #define B r9 | |||||
| #define C r10 | |||||
| #define LDC r7 /* LDC is loaded from the caller's frame below in this configuration */ | |||||
| #define OFFSET r6 | |||||
| #else | |||||
| #define A r7 | |||||
| #define B r8 | |||||
| #define C r9 | |||||
| #define LDC r10 | |||||
| #define OFFSET r6 | |||||
| #endif | |||||
| #endif | |||||
| #define alpha_r vs18 /* VSX register that receives alpha via lxvdsx below */ | |||||
| #define o0 0 | |||||
| #define K1 r13 /* working registers r13-r31: every one is saved in the prologue and restored at .L999 */ | |||||
| #define KKK r14 | |||||
| #define o8 r15 | |||||
| #define o24 r16 | |||||
| #define ALPHA r17 | |||||
| #define L r18 | |||||
| #define T1 r19 | |||||
| #define KK r20 | |||||
| #define BB r21 | |||||
| #define I r22 | |||||
| #define J r23 | |||||
| #define AO r24 | |||||
| #define BO r25 | |||||
| #define CO r26 | |||||
| #define o16 r27 | |||||
| #define o32 r28 | |||||
| #define o48 r29 | |||||
| #define PRE r30 | |||||
| #define T2 r31 | |||||
| #include "dgemm_macros_16x4_power8.S" /* presumably the compute macros used by the logic file - not visible here */ | |||||
| #ifndef NEEDPARAM | |||||
| PROLOGUE | |||||
| PROFCODE | |||||
| addi SP, SP, -STACKSIZE /* reserve the local frame */ | |||||
| li r0, 0 /* zero value stored to FZERO below */ | |||||
| stfd f14, 0(SP) /* save f14-f31 */ | |||||
| stfd f15, 8(SP) | |||||
| stfd f16, 16(SP) | |||||
| stfd f17, 24(SP) | |||||
| stfd f18, 32(SP) | |||||
| stfd f19, 40(SP) | |||||
| stfd f20, 48(SP) | |||||
| stfd f21, 56(SP) | |||||
| stfd f22, 64(SP) | |||||
| stfd f23, 72(SP) | |||||
| stfd f24, 80(SP) | |||||
| stfd f25, 88(SP) | |||||
| stfd f26, 96(SP) | |||||
| stfd f27, 104(SP) | |||||
| stfd f28, 112(SP) | |||||
| stfd f29, 120(SP) | |||||
| stfd f30, 128(SP) | |||||
| stfd f31, 136(SP) | |||||
| #ifdef __64BIT__ /* save r13-r31 (8-byte slots in 64-bit builds, 4-byte slots otherwise) */ | |||||
| std r31, 144(SP) | |||||
| std r30, 152(SP) | |||||
| std r29, 160(SP) | |||||
| std r28, 168(SP) | |||||
| std r27, 176(SP) | |||||
| std r26, 184(SP) | |||||
| std r25, 192(SP) | |||||
| std r24, 200(SP) | |||||
| std r23, 208(SP) | |||||
| std r22, 216(SP) | |||||
| std r21, 224(SP) | |||||
| std r20, 232(SP) | |||||
| std r19, 240(SP) | |||||
| std r18, 248(SP) | |||||
| std r17, 256(SP) | |||||
| std r16, 264(SP) | |||||
| std r15, 272(SP) | |||||
| std r14, 280(SP) | |||||
| std r13, 288(SP) | |||||
| #else | |||||
| stw r31, 144(SP) | |||||
| stw r30, 148(SP) | |||||
| stw r29, 152(SP) | |||||
| stw r28, 156(SP) | |||||
| stw r27, 160(SP) | |||||
| stw r26, 164(SP) | |||||
| stw r25, 168(SP) | |||||
| stw r24, 172(SP) | |||||
| stw r23, 176(SP) | |||||
| stw r22, 180(SP) | |||||
| stw r21, 184(SP) | |||||
| stw r20, 188(SP) | |||||
| stw r19, 192(SP) | |||||
| stw r18, 196(SP) | |||||
| stw r17, 200(SP) | |||||
| stw r16, 204(SP) | |||||
| stw r15, 208(SP) | |||||
| stw r14, 212(SP) | |||||
| stw r13, 216(SP) | |||||
| #endif | |||||
| stfd f1, ALPHA_SP /* spill f1 (alpha) to the frame so lxvdsx can reload it below */ | |||||
| stw r0, FZERO /* store a 32-bit zero at FZERO */ | |||||
| #if defined(_AIX) || defined(__APPLE__) | |||||
| #if !defined(__64BIT__) && defined(DOUBLE) | |||||
| lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) /* fetch LDC from the caller's frame; STACKSIZE compensates for the lowered SP */ | |||||
| #endif | |||||
| #endif | |||||
| slwi LDC, LDC, BASE_SHIFT /* LDC <<= BASE_SHIFT (presumably elements to bytes; BASE_SHIFT is defined outside this file) */ | |||||
| #if defined(TRMMKERNEL) /* TRMM builds: fetch OFFSET from the caller's frame where the aliases above require it */ | |||||
| #if defined(linux) && defined(__64BIT__) | |||||
| ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) | |||||
| #endif | |||||
| #if defined(_AIX) || defined(__APPLE__) | |||||
| #ifdef __64BIT__ | |||||
| ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) | |||||
| #else | |||||
| #ifdef DOUBLE | |||||
| lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) | |||||
| #else | |||||
| lwz OFFSET, FRAMESLOT(0) + STACKSIZE(SP) | |||||
| #endif | |||||
| #endif | |||||
| #endif | |||||
| #endif | |||||
| mr KK, OFFSET /* KK starts as OFFSET (executed in every build, not only TRMMKERNEL) */ | |||||
| #if defined(TRMMKERNEL) && !defined(LEFT) | |||||
| neg KK, KK /* non-LEFT TRMM variant starts from -OFFSET */ | |||||
| #endif | |||||
| cmpwi cr0, M, 0 /* leave early when M, N or K is <= 0 (32-bit compares) */ | |||||
| ble .L999_H1 /* .L999_H1 is not defined in this file; presumably provided by the included logic file */ | |||||
| cmpwi cr0, N, 0 | |||||
| ble .L999_H1 | |||||
| cmpwi cr0, K, 0 | |||||
| ble .L999_H1 | |||||
| #ifdef __64BIT__ /* ALPHA = address of the spilled alpha; literals must stay in sync with ALPHA_SP */ | |||||
| addi ALPHA, SP, 296 | |||||
| #else | |||||
| addi ALPHA, SP, 224 | |||||
| #endif | |||||
| li PRE, 256 /* PRE = 256; presumably a prefetch byte distance consumed by the logic file - confirm */ | |||||
| li o8 , 8 /* constant byte offsets held in registers for indexed-form instructions */ | |||||
| li o16, 16 | |||||
| li o24, 24 | |||||
| li o32, 32 | |||||
| li o48, 48 | |||||
| lxvdsx alpha_r, 0, ALPHA /* load the doubleword at ALPHA and splat it across alpha_r */ | |||||
| #include "dtrmm_logic_16x4_power8.S" /* loop body; expected to branch or fall through to .L999 */ | |||||
| .L999: /* common exit: restore saved registers and return */ | |||||
| addi r3, 0, 0 /* r3 = 0 before returning */ | |||||
| lfd f14, 0(SP) /* restore f14-f31 */ | |||||
| lfd f15, 8(SP) | |||||
| lfd f16, 16(SP) | |||||
| lfd f17, 24(SP) | |||||
| lfd f18, 32(SP) | |||||
| lfd f19, 40(SP) | |||||
| lfd f20, 48(SP) | |||||
| lfd f21, 56(SP) | |||||
| lfd f22, 64(SP) | |||||
| lfd f23, 72(SP) | |||||
| lfd f24, 80(SP) | |||||
| lfd f25, 88(SP) | |||||
| lfd f26, 96(SP) | |||||
| lfd f27, 104(SP) | |||||
| lfd f28, 112(SP) | |||||
| lfd f29, 120(SP) | |||||
| lfd f30, 128(SP) | |||||
| lfd f31, 136(SP) | |||||
| #ifdef __64BIT__ /* restore r13-r31 from the same slots used in the prologue */ | |||||
| ld r31, 144(SP) | |||||
| ld r30, 152(SP) | |||||
| ld r29, 160(SP) | |||||
| ld r28, 168(SP) | |||||
| ld r27, 176(SP) | |||||
| ld r26, 184(SP) | |||||
| ld r25, 192(SP) | |||||
| ld r24, 200(SP) | |||||
| ld r23, 208(SP) | |||||
| ld r22, 216(SP) | |||||
| ld r21, 224(SP) | |||||
| ld r20, 232(SP) | |||||
| ld r19, 240(SP) | |||||
| ld r18, 248(SP) | |||||
| ld r17, 256(SP) | |||||
| ld r16, 264(SP) | |||||
| ld r15, 272(SP) | |||||
| ld r14, 280(SP) | |||||
| ld r13, 288(SP) | |||||
| #else | |||||
| lwz r31, 144(SP) | |||||
| lwz r30, 148(SP) | |||||
| lwz r29, 152(SP) | |||||
| lwz r28, 156(SP) | |||||
| lwz r27, 160(SP) | |||||
| lwz r26, 164(SP) | |||||
| lwz r25, 168(SP) | |||||
| lwz r24, 172(SP) | |||||
| lwz r23, 176(SP) | |||||
| lwz r22, 180(SP) | |||||
| lwz r21, 184(SP) | |||||
| lwz r20, 188(SP) | |||||
| lwz r19, 192(SP) | |||||
| lwz r18, 196(SP) | |||||
| lwz r17, 200(SP) | |||||
| lwz r16, 204(SP) | |||||
| lwz r15, 208(SP) | |||||
| lwz r14, 212(SP) | |||||
| lwz r13, 216(SP) | |||||
| #endif | |||||
| addi SP, SP, STACKSIZE /* release the local frame */ | |||||
| blr | |||||
| EPILOGUE | |||||
| #endif | |||||
| @@ -107,6 +107,11 @@ | |||||
| #ifdef PPCG4 | #ifdef PPCG4 | ||||
| #define PREFETCHSIZE 16 | #define PREFETCHSIZE 16 | ||||
| #define PREFETCHWSIZE 72 | #define PREFETCHWSIZE 72 | ||||
| #endif | |||||
| #ifdef POWER8 | |||||
| #define PREFETCHSIZE 16 | |||||
| #define PREFETCHWSIZE 72 | |||||
| #endif | #endif | ||||
| PROLOGUE | PROLOGUE | ||||
| @@ -193,7 +198,7 @@ LL(12): | |||||
| STFD c12, 14 * SIZE(B) | STFD c12, 14 * SIZE(B) | ||||
| STFD c16, 15 * SIZE(B) | STFD c16, 15 * SIZE(B) | ||||
| #ifdef POWER6 | |||||
| #if defined(POWER6) || defined(POWER8) | |||||
| dcbtst PREA, AO1 | dcbtst PREA, AO1 | ||||
| dcbtst PREA, AO2 | dcbtst PREA, AO2 | ||||
| dcbtst PREA, AO3 | dcbtst PREA, AO3 | ||||
| @@ -111,6 +111,11 @@ | |||||
| #ifdef PPCG4 | #ifdef PPCG4 | ||||
| #define PREFETCHSIZE 16 | #define PREFETCHSIZE 16 | ||||
| #define PREFETCHWSIZE 48 | #define PREFETCHWSIZE 48 | ||||
| #endif | |||||
| #ifdef POWER8 | |||||
| #define PREFETCHSIZE 16 | |||||
| #define PREFETCHWSIZE 48 | |||||
| #endif | #endif | ||||
| PROLOGUE | PROLOGUE | ||||
| @@ -224,7 +229,7 @@ LL(12): | |||||
| STFD c15, 14 * SIZE(B1) | STFD c15, 14 * SIZE(B1) | ||||
| STFD c16, 15 * SIZE(B1) | STFD c16, 15 * SIZE(B1) | ||||
| #ifdef POWER6 | |||||
| #if defined(POWER6) || defined(POWER8) | |||||
| dcbtst PREA, AO1 | dcbtst PREA, AO1 | ||||
| dcbtst PREA, AO2 | dcbtst PREA, AO2 | ||||
| dcbtst PREA, AO3 | dcbtst PREA, AO3 | ||||
| @@ -174,6 +174,12 @@ | |||||
| #define PREFETCHSIZE_C 40 | #define PREFETCHSIZE_C 40 | ||||
| #endif | #endif | ||||
| #ifdef POWER8 | |||||
| #define PREFETCHSIZE_A 96 | |||||
| #define PREFETCHSIZE_C 40 | |||||
| #endif | |||||
| #ifndef NEEDPARAM | #ifndef NEEDPARAM | ||||
| #ifndef __64BIT__ | #ifndef __64BIT__ | ||||
| @@ -139,6 +139,11 @@ | |||||
| #define PREFETCHSIZE_C 8 | #define PREFETCHSIZE_C 8 | ||||
| #endif | #endif | ||||
| #ifdef POWER8 | |||||
| #define PREFETCHSIZE_A 96 | |||||
| #define PREFETCHSIZE_C 8 | |||||
| #endif | |||||
| #define y01 f0 | #define y01 f0 | ||||
| #define y02 f1 | #define y02 f1 | ||||
| #define y03 f2 | #define y03 f2 | ||||
| @@ -0,0 +1,367 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013-2016, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| /************************************************************************************** | |||||
| * 2016/03/05 Werner Saar (wernsaar@googlemail.com) | |||||
| * BLASTEST : OK | |||||
| * CTEST : OK | |||||
| * TEST : OK | |||||
| * LAPACK-TEST : OK | |||||
| **************************************************************************************/ | |||||
| /*********************************************************************/ | |||||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||||
| /* All rights reserved. */ | |||||
| /* */ | |||||
| /* Redistribution and use in source and binary forms, with or */ | |||||
| /* without modification, are permitted provided that the following */ | |||||
| /* conditions are met: */ | |||||
| /* */ | |||||
| /* 1. Redistributions of source code must retain the above */ | |||||
| /* copyright notice, this list of conditions and the following */ | |||||
| /* disclaimer. */ | |||||
| /* */ | |||||
| /* 2. Redistributions in binary form must reproduce the above */ | |||||
| /* copyright notice, this list of conditions and the following */ | |||||
| /* disclaimer in the documentation and/or other materials */ | |||||
| /* provided with the distribution. */ | |||||
| /* */ | |||||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||||
| /* */ | |||||
| /* The views and conclusions contained in the software and */ | |||||
| /* documentation are those of the authors and should not be */ | |||||
| /* interpreted as representing official policies, either expressed */ | |||||
| /* or implied, of The University of Texas at Austin. */ | |||||
| /*********************************************************************/ | |||||
| #define ASSEMBLER /* POWER8 kernel entry: prologue, constant setup, included zgemm_logic_8x2_power8.S body, epilogue */ | |||||
| #include "common.h" | |||||
| #include "def_vsx.h" | |||||
| #ifndef __64BIT__ /* LOAD = load mnemonic matching the build's word size */ | |||||
| #define LOAD lwz | |||||
| #else | |||||
| #define LOAD ld | |||||
| #endif | |||||
| #ifdef __64BIT__ /* local frame layout: f14-f31 at 0..136, GPR saves from 144, then alpha real, alpha imag and a zero word */ | |||||
| #define STACKSIZE 320 | |||||
| #define ALPHA_R_SP 296(SP) | |||||
| #define ALPHA_I_SP 304(SP) | |||||
| #define FZERO 312(SP) | |||||
| #else | |||||
| #define STACKSIZE 256 | |||||
| #define ALPHA_R_SP 224(SP) | |||||
| #define ALPHA_I_SP 232(SP) | |||||
| #define FZERO 240(SP) | |||||
| #endif | |||||
| #define M r3 /* incoming dimension arguments */ | |||||
| #define N r4 | |||||
| #define K r5 | |||||
| #ifdef linux /* register aliases for the remaining arguments on linux */ | |||||
| #ifndef __64BIT__ | |||||
| #define A r6 | |||||
| #define B r7 | |||||
| #define C r8 | |||||
| #define LDC r9 | |||||
| #define OFFSET r10 | |||||
| #else | |||||
| #define A r8 | |||||
| #define B r9 | |||||
| #define C r10 | |||||
| #define LDC r6 /* LDC (and OFFSET for TRMM) are loaded from the caller's frame below in this configuration */ | |||||
| #define OFFSET r7 | |||||
| #endif | |||||
| #endif | |||||
| #if defined(_AIX) || defined(__APPLE__) /* register aliases for the remaining arguments on AIX / Apple */ | |||||
| #if !defined(__64BIT__) && defined(DOUBLE) | |||||
| #define A r10 | |||||
| #define B r6 | |||||
| #define C r7 | |||||
| #define LDC r8 | |||||
| #define OFFSET r9 | |||||
| #else | |||||
| #define A r8 | |||||
| #define B r9 | |||||
| #define C r10 | |||||
| #define LDC r6 | |||||
| #define OFFSET r7 | |||||
| #endif | |||||
| #endif | |||||
| #define o0 0 | |||||
| #define alpha_r vs30 /* VSX registers that receive the two spilled alpha doublewords via lxvdsx below */ | |||||
| #define alpha_i vs31 | |||||
| #define L r15 /* working registers; r15-r31 are saved in the prologue and restored at .L999 */ | |||||
| #define ALPHA r16 | |||||
| #define o24 r17 | |||||
| #define T2 r19 | |||||
| #define KK r20 | |||||
| #define o8 r21 | |||||
| #define I r22 | |||||
| #define J r23 | |||||
| #define AO r24 | |||||
| #define BO r25 | |||||
| #define CO r26 | |||||
| #define o16 r27 | |||||
| #define o32 r28 | |||||
| #define o48 r29 | |||||
| #define PRE r30 | |||||
| #define T1 r31 | |||||
| #ifndef NEEDPARAM | |||||
| PROLOGUE | |||||
| PROFCODE | |||||
| addi SP, SP, -STACKSIZE /* reserve the local frame */ | |||||
| li r0, 0 /* zero value stored to FZERO below */ | |||||
| stfd f14, 0(SP) /* save f14-f31 */ | |||||
| stfd f15, 8(SP) | |||||
| stfd f16, 16(SP) | |||||
| stfd f17, 24(SP) | |||||
| stfd f18, 32(SP) | |||||
| stfd f19, 40(SP) | |||||
| stfd f20, 48(SP) | |||||
| stfd f21, 56(SP) | |||||
| stfd f22, 64(SP) | |||||
| stfd f23, 72(SP) | |||||
| stfd f24, 80(SP) | |||||
| stfd f25, 88(SP) | |||||
| stfd f26, 96(SP) | |||||
| stfd f27, 104(SP) | |||||
| stfd f28, 112(SP) | |||||
| stfd f29, 120(SP) | |||||
| stfd f30, 128(SP) | |||||
| stfd f31, 136(SP) | |||||
| #ifdef __64BIT__ /* save r15-r31 (8-byte slots in 64-bit builds, 4-byte slots otherwise) */ | |||||
| std r31, 144(SP) | |||||
| std r30, 152(SP) | |||||
| std r29, 160(SP) | |||||
| std r28, 168(SP) | |||||
| std r27, 176(SP) | |||||
| std r26, 184(SP) | |||||
| std r25, 192(SP) | |||||
| std r24, 200(SP) | |||||
| std r23, 208(SP) | |||||
| std r22, 216(SP) | |||||
| std r21, 224(SP) | |||||
| std r20, 232(SP) | |||||
| std r19, 240(SP) | |||||
| std r18, 248(SP) | |||||
| std r17, 256(SP) | |||||
| std r16, 264(SP) | |||||
| std r15, 272(SP) | |||||
| #else | |||||
| stw r31, 144(SP) | |||||
| stw r30, 148(SP) | |||||
| stw r29, 152(SP) | |||||
| stw r28, 156(SP) | |||||
| stw r27, 160(SP) | |||||
| stw r26, 164(SP) | |||||
| stw r25, 168(SP) | |||||
| stw r24, 172(SP) | |||||
| stw r23, 176(SP) | |||||
| stw r22, 180(SP) | |||||
| stw r21, 184(SP) | |||||
| stw r20, 188(SP) | |||||
| stw r19, 192(SP) | |||||
| stw r18, 196(SP) | |||||
| stw r17, 200(SP) | |||||
| stw r16, 204(SP) | |||||
| stw r15, 208(SP) | |||||
| #endif | |||||
| stfd f1, ALPHA_R_SP /* spill f1 and f2 (alpha real / imaginary, per the slot names) for the lxvdsx reloads below */ | |||||
| stfd f2, ALPHA_I_SP | |||||
| stw r0, FZERO /* store a 32-bit zero at FZERO */ | |||||
| #ifdef linux /* fetch arguments that arrive in the caller's frame; STACKSIZE compensates for the lowered SP */ | |||||
| #ifdef __64BIT__ | |||||
| ld LDC, FRAMESLOT(0) + STACKSIZE(SP) | |||||
| #endif | |||||
| #endif | |||||
| #if defined(_AIX) || defined(__APPLE__) | |||||
| #ifdef __64BIT__ | |||||
| ld LDC, FRAMESLOT(0) + STACKSIZE(SP) | |||||
| #else | |||||
| #ifdef DOUBLE | |||||
| lwz B, FRAMESLOT(0) + STACKSIZE(SP) | |||||
| lwz C, FRAMESLOT(1) + STACKSIZE(SP) | |||||
| lwz LDC, FRAMESLOT(2) + STACKSIZE(SP) | |||||
| #else | |||||
| lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) | |||||
| #endif | |||||
| #endif | |||||
| #endif | |||||
| #ifdef TRMMKERNEL /* TRMM builds: fetch OFFSET from the caller's frame where the aliases above require it */ | |||||
| #if defined(linux) && defined(__64BIT__) | |||||
| ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) | |||||
| #endif | |||||
| #if defined(_AIX) || defined(__APPLE__) | |||||
| #ifdef __64BIT__ | |||||
| ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) | |||||
| #else | |||||
| #ifdef DOUBLE | |||||
| lwz OFFSET, FRAMESLOT(3) + STACKSIZE(SP) | |||||
| #else | |||||
| lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) | |||||
| #endif | |||||
| #endif | |||||
| #endif | |||||
| #if defined(TRMMKERNEL) && !defined(LEFT) | |||||
| neg KK, OFFSET /* non-LEFT TRMM variant: KK = -OFFSET */ | |||||
| #endif | |||||
| #endif | |||||
| #include "zgemm_macros_8x2_power8.S" /* presumably the compute macros used by the logic file - not visible here */ | |||||
| cmpwi cr0, M, 0 /* leave early when M, N or K is <= 0 (32-bit compares) */ | |||||
| ble .L999 | |||||
| cmpwi cr0, N, 0 | |||||
| ble .L999 | |||||
| cmpwi cr0, K, 0 | |||||
| ble .L999 | |||||
| slwi LDC, LDC, ZBASE_SHIFT /* LDC <<= ZBASE_SHIFT (presumably elements to bytes; ZBASE_SHIFT is defined outside this file) */ | |||||
| li PRE, 256 /* PRE = 256; used as the dcbt index register by the logic file */ | |||||
| li o8 , 8 /* constant byte offsets held in registers for indexed-form instructions */ | |||||
| li o16 , 16 | |||||
| li o24 , 24 | |||||
| li o32 , 32 | |||||
| li o48 , 48 | |||||
| #ifdef __64BIT__ /* ALPHA = address of the spilled alpha pair; literals must stay in sync with ALPHA_R_SP */ | |||||
| addi ALPHA, SP, 296 | |||||
| #else | |||||
| addi ALPHA, SP, 224 | |||||
| #endif | |||||
| lxvdsx alpha_r, 0, ALPHA /* load the doubleword at ALPHA and splat it across alpha_r */ | |||||
| lxvdsx alpha_i, o8, ALPHA /* same for the doubleword at ALPHA + 8 (o8 holds 8) */ | |||||
| .align 5 | |||||
| #include "zgemm_logic_8x2_power8.S" /* loop body; branches to .L999 when finished */ | |||||
| .L999: /* common exit: restore saved registers and return */ | |||||
| addi r3, 0, 0 /* r3 = 0 before returning */ | |||||
| lfd f14, 0(SP) /* restore f14-f31 */ | |||||
| lfd f15, 8(SP) | |||||
| lfd f16, 16(SP) | |||||
| lfd f17, 24(SP) | |||||
| lfd f18, 32(SP) | |||||
| lfd f19, 40(SP) | |||||
| lfd f20, 48(SP) | |||||
| lfd f21, 56(SP) | |||||
| lfd f22, 64(SP) | |||||
| lfd f23, 72(SP) | |||||
| lfd f24, 80(SP) | |||||
| lfd f25, 88(SP) | |||||
| lfd f26, 96(SP) | |||||
| lfd f27, 104(SP) | |||||
| lfd f28, 112(SP) | |||||
| lfd f29, 120(SP) | |||||
| lfd f30, 128(SP) | |||||
| lfd f31, 136(SP) | |||||
| #ifdef __64BIT__ /* restore r15-r31 from the same slots used in the prologue */ | |||||
| ld r31, 144(SP) | |||||
| ld r30, 152(SP) | |||||
| ld r29, 160(SP) | |||||
| ld r28, 168(SP) | |||||
| ld r27, 176(SP) | |||||
| ld r26, 184(SP) | |||||
| ld r25, 192(SP) | |||||
| ld r24, 200(SP) | |||||
| ld r23, 208(SP) | |||||
| ld r22, 216(SP) | |||||
| ld r21, 224(SP) | |||||
| ld r20, 232(SP) | |||||
| ld r19, 240(SP) | |||||
| ld r18, 248(SP) | |||||
| ld r17, 256(SP) | |||||
| ld r16, 264(SP) | |||||
| ld r15, 272(SP) | |||||
| #else | |||||
| lwz r31, 144(SP) | |||||
| lwz r30, 148(SP) | |||||
| lwz r29, 152(SP) | |||||
| lwz r28, 156(SP) | |||||
| lwz r27, 160(SP) | |||||
| lwz r26, 164(SP) | |||||
| lwz r25, 168(SP) | |||||
| lwz r24, 172(SP) | |||||
| lwz r23, 176(SP) | |||||
| lwz r22, 180(SP) | |||||
| lwz r21, 184(SP) | |||||
| lwz r20, 188(SP) | |||||
| lwz r19, 192(SP) | |||||
| lwz r18, 196(SP) | |||||
| lwz r17, 200(SP) | |||||
| lwz r16, 204(SP) | |||||
| lwz r15, 208(SP) | |||||
| #endif | |||||
| addi SP, SP, STACKSIZE /* release the local frame */ | |||||
| blr | |||||
| EPILOGUE | |||||
| #endif | |||||
| @@ -0,0 +1,901 @@ | |||||
| srawi. J, N, 1 /* J = N >> 1: number of column pairs to process */ | |||||
| ble .LZGEMM_L2_END /* no pairs: go straight to the single-column path */ | |||||
| .LZGEMM_L2_BEGIN: /* one pass per column pair */ | |||||
| mr CO, C /* CO = output pointer for this pair */ | |||||
| mr AO, A /* restart the A pointer for each pair */ | |||||
| slwi T1, LDC , 1 /* T1 = 2 * LDC */ | |||||
| add C, C, T1 /* advance C past this pair (LDC is presumably a byte stride at this point - confirm against the includer) */ | |||||
| srawi. I, M, 3 /* I = M >> 3: number of 8-row tiles */ | |||||
| ble .LZGEMM_L2x8_END /* fewer than 8 rows: skip the 2x8 tile loop */ | |||||
| .LZGEMM_L2x8_BEGIN: /* 2x8 tile loop; the LOAD/KERNEL/SAVE macros are defined outside this file */ | |||||
| mr BO, B /* restart the B pointer for every tile */ | |||||
| srawi. L, K, 3 /* L = K >> 3: number of 8-invocation chunks */ | |||||
| ble .LZGEMM_L2x8_SUB0 /* K < 8: remainder path only */ | |||||
| cmpwi cr0, L, 1 | |||||
| ble .LZGEMM_L2x8_SUB4 /* exactly one chunk: non-pipelined path */ | |||||
| .LZGEMM_L2x8_LOOP_START: /* first chunk: initial load plus eight kernel macros, each preceded by a prefetch hint at AO + PRE */ | |||||
| dcbt AO, PRE | |||||
| LOAD2x8_1 | |||||
| dcbt AO, PRE | |||||
| KERNEL2x8_I1 | |||||
| dcbt AO, PRE | |||||
| KERNEL2x8_2 | |||||
| dcbt AO, PRE | |||||
| KERNEL2x8_1 | |||||
| dcbt AO, PRE | |||||
| KERNEL2x8_2 | |||||
| dcbt AO, PRE | |||||
| KERNEL2x8_1 | |||||
| dcbt AO, PRE | |||||
| KERNEL2x8_2 | |||||
| dcbt AO, PRE | |||||
| KERNEL2x8_1 | |||||
| dcbt AO, PRE | |||||
| KERNEL2x8_2 | |||||
| addic. L, L, -2 /* discount this chunk and the closing chunk at LOOP_END */ | |||||
| ble .LZGEMM_L2x8_LOOP_END | |||||
| .align 5 | |||||
| .LZGEMM_L2x8_LOOP: /* steady state: eight kernel macros per pass */ | |||||
| dcbt AO, PRE | |||||
| KERNEL2x8_1 | |||||
| dcbt AO, PRE | |||||
| KERNEL2x8_2 | |||||
| dcbt AO, PRE | |||||
| KERNEL2x8_1 | |||||
| dcbt AO, PRE | |||||
| KERNEL2x8_2 | |||||
| dcbt AO, PRE | |||||
| KERNEL2x8_1 | |||||
| dcbt AO, PRE | |||||
| KERNEL2x8_2 | |||||
| dcbt AO, PRE | |||||
| KERNEL2x8_1 | |||||
| dcbt AO, PRE | |||||
| KERNEL2x8_2 | |||||
| addic. L, L, -1 | |||||
| bgt .LZGEMM_L2x8_LOOP | |||||
| .LZGEMM_L2x8_LOOP_END: /* closing chunk: ends with KERNEL2x8_E2 in place of KERNEL2x8_2 */ | |||||
| dcbt AO, PRE | |||||
| KERNEL2x8_1 | |||||
| dcbt AO, PRE | |||||
| KERNEL2x8_2 | |||||
| dcbt AO, PRE | |||||
| KERNEL2x8_1 | |||||
| dcbt AO, PRE | |||||
| KERNEL2x8_2 | |||||
| dcbt AO, PRE | |||||
| KERNEL2x8_1 | |||||
| dcbt AO, PRE | |||||
| KERNEL2x8_2 | |||||
| dcbt AO, PRE | |||||
| KERNEL2x8_1 | |||||
| KERNEL2x8_E2 | |||||
| b .LZGEMM_L2x8_SUB1 | |||||
| .LZGEMM_L2x8_SUB4: /* single chunk: KERNEL2x8_SUBI1 followed by seven KERNEL2x8_SUB1 */ | |||||
| dcbt AO, PRE | |||||
| KERNEL2x8_SUBI1 | |||||
| dcbt AO, PRE | |||||
| KERNEL2x8_SUB1 | |||||
| dcbt AO, PRE | |||||
| KERNEL2x8_SUB1 | |||||
| dcbt AO, PRE | |||||
| KERNEL2x8_SUB1 | |||||
| KERNEL2x8_SUB1 | |||||
| KERNEL2x8_SUB1 | |||||
| KERNEL2x8_SUB1 | |||||
| KERNEL2x8_SUB1 | |||||
| b .LZGEMM_L2x8_SUB1 | |||||
| .LZGEMM_L2x8_SUB0: /* K < 8: KERNEL2x8_SUBI1 once, then the remainder loop for whatever is left */ | |||||
| andi. L, K, 7 /* L = K & 7 */ | |||||
| KERNEL2x8_SUBI1 | |||||
| addic. L, L, -1 | |||||
| ble .LZGEMM_L2x8_SAVE | |||||
| b .LZGEMM_L2x8_SUB2 | |||||
| .LZGEMM_L2x8_SUB1: /* remainder after the full chunks */ | |||||
| andi. L, K, 7 /* L = K & 7 */ | |||||
| ble .LZGEMM_L2x8_SAVE /* nothing left over */ | |||||
| .LZGEMM_L2x8_SUB2: /* one KERNEL2x8_SUB1 per remaining count in L */ | |||||
| KERNEL2x8_SUB1 | |||||
| addic. L, L, -1 | |||||
| bgt .LZGEMM_L2x8_SUB2 | |||||
| .LZGEMM_L2x8_SAVE: | |||||
| SAVE2x8 | |||||
| addic. I, I, -1 /* next 8-row tile while I > 0 */ | |||||
| bgt .LZGEMM_L2x8_BEGIN | |||||
| .LZGEMM_L2x8_END: | |||||
| .LZGEMM_L2x4_BEGIN: /* 2x4 tile, run at most once; the LOAD/KERNEL/SAVE macros are defined outside this file */ | |||||
| andi. T2, M, 7 | |||||
| ble .LZGEMM_L2x1_END /* M is a multiple of 8: skip all narrower tiles */ | |||||
| andi. T1, M, 4 | |||||
| ble .LZGEMM_L2x4_END /* bit 2 of M clear: no 4-row tile */ | |||||
| mr BO, B /* restart the B pointer */ | |||||
| srawi. L, K, 3 /* L = K >> 3: number of 8-invocation chunks */ | |||||
| ble .LZGEMM_L2x4_SUB0 /* K < 8: remainder path only */ | |||||
| cmpwi cr0, L, 1 | |||||
| ble .LZGEMM_L2x4_SUB4 /* exactly one chunk: non-pipelined path */ | |||||
| .LZGEMM_L2x4_LOOP_START: /* first chunk: initial load plus eight kernel macros */ | |||||
| LOAD2x4_1 | |||||
| KERNEL2x4_I1 | |||||
| KERNEL2x4_2 | |||||
| KERNEL2x4_1 | |||||
| KERNEL2x4_2 | |||||
| KERNEL2x4_1 | |||||
| KERNEL2x4_2 | |||||
| KERNEL2x4_1 | |||||
| KERNEL2x4_2 | |||||
| addic. L, L, -2 /* discount this chunk and the closing chunk at LOOP_END */ | |||||
| ble .LZGEMM_L2x4_LOOP_END | |||||
| .align 5 | |||||
| .LZGEMM_L2x4_LOOP: /* steady state: eight kernel macros per pass */ | |||||
| KERNEL2x4_1 | |||||
| KERNEL2x4_2 | |||||
| KERNEL2x4_1 | |||||
| KERNEL2x4_2 | |||||
| KERNEL2x4_1 | |||||
| KERNEL2x4_2 | |||||
| KERNEL2x4_1 | |||||
| KERNEL2x4_2 | |||||
| addic. L, L, -1 | |||||
| bgt .LZGEMM_L2x4_LOOP | |||||
| .LZGEMM_L2x4_LOOP_END: /* closing chunk: ends with KERNEL2x4_E2 in place of KERNEL2x4_2 */ | |||||
| KERNEL2x4_1 | |||||
| KERNEL2x4_2 | |||||
| KERNEL2x4_1 | |||||
| KERNEL2x4_2 | |||||
| KERNEL2x4_1 | |||||
| KERNEL2x4_2 | |||||
| KERNEL2x4_1 | |||||
| KERNEL2x4_E2 | |||||
| b .LZGEMM_L2x4_SUB1 | |||||
| .LZGEMM_L2x4_SUB4: /* single chunk: KERNEL2x4_SUBI1 followed by seven KERNEL2x4_SUB1 */ | |||||
| KERNEL2x4_SUBI1 | |||||
| KERNEL2x4_SUB1 | |||||
| KERNEL2x4_SUB1 | |||||
| KERNEL2x4_SUB1 | |||||
| KERNEL2x4_SUB1 | |||||
| KERNEL2x4_SUB1 | |||||
| KERNEL2x4_SUB1 | |||||
| KERNEL2x4_SUB1 | |||||
| b .LZGEMM_L2x4_SUB1 | |||||
| .LZGEMM_L2x4_SUB0: /* K < 8: KERNEL2x4_SUBI1 once, then the remainder loop for whatever is left */ | |||||
| andi. L, K, 7 /* L = K & 7 */ | |||||
| KERNEL2x4_SUBI1 | |||||
| addic. L, L, -1 | |||||
| ble .LZGEMM_L2x4_SAVE | |||||
| b .LZGEMM_L2x4_SUB2 | |||||
| .LZGEMM_L2x4_SUB1: /* remainder after the full chunks */ | |||||
| andi. L, K, 7 /* L = K & 7 */ | |||||
| ble .LZGEMM_L2x4_SAVE /* nothing left over */ | |||||
| .LZGEMM_L2x4_SUB2: /* one KERNEL2x4_SUB1 per remaining count in L */ | |||||
| KERNEL2x4_SUB1 | |||||
| addic. L, L, -1 | |||||
| bgt .LZGEMM_L2x4_SUB2 | |||||
| .LZGEMM_L2x4_SAVE: | |||||
| SAVE2x4 | |||||
| .LZGEMM_L2x4_END: | |||||
| .LZGEMM_L2x2_BEGIN: /* 2x2 tile, run at most once; the LOAD/KERNEL/SAVE macros are defined outside this file */ | |||||
| andi. T1, M, 2 | |||||
| ble .LZGEMM_L2x2_END /* bit 1 of M clear: no 2-row tile */ | |||||
| mr BO, B /* restart the B pointer */ | |||||
| srawi. L, K, 3 /* L = K >> 3: number of 8-invocation chunks */ | |||||
| ble .LZGEMM_L2x2_SUB0 /* K < 8: remainder path only */ | |||||
| cmpwi cr0, L, 1 | |||||
| ble .LZGEMM_L2x2_SUB4 /* exactly one chunk: non-pipelined path */ | |||||
| .LZGEMM_L2x2_LOOP_START: /* first chunk: initial load plus eight kernel macros */ | |||||
| LOAD2x2_1 | |||||
| KERNEL2x2_I1 | |||||
| KERNEL2x2_2 | |||||
| KERNEL2x2_1 | |||||
| KERNEL2x2_2 | |||||
| KERNEL2x2_1 | |||||
| KERNEL2x2_2 | |||||
| KERNEL2x2_1 | |||||
| KERNEL2x2_2 | |||||
| addic. L, L, -2 /* discount this chunk and the closing chunk at LOOP_END */ | |||||
| ble .LZGEMM_L2x2_LOOP_END | |||||
| .align 5 | |||||
| .LZGEMM_L2x2_LOOP: /* steady state: eight kernel macros per pass */ | |||||
| KERNEL2x2_1 | |||||
| KERNEL2x2_2 | |||||
| KERNEL2x2_1 | |||||
| KERNEL2x2_2 | |||||
| KERNEL2x2_1 | |||||
| KERNEL2x2_2 | |||||
| KERNEL2x2_1 | |||||
| KERNEL2x2_2 | |||||
| addic. L, L, -1 | |||||
| bgt .LZGEMM_L2x2_LOOP | |||||
| .LZGEMM_L2x2_LOOP_END: /* closing chunk: ends with KERNEL2x2_E2 in place of KERNEL2x2_2 */ | |||||
| KERNEL2x2_1 | |||||
| KERNEL2x2_2 | |||||
| KERNEL2x2_1 | |||||
| KERNEL2x2_2 | |||||
| KERNEL2x2_1 | |||||
| KERNEL2x2_2 | |||||
| KERNEL2x2_1 | |||||
| KERNEL2x2_E2 | |||||
| b .LZGEMM_L2x2_SUB1 | |||||
| .LZGEMM_L2x2_SUB4: /* single chunk: KERNEL2x2_SUBI1 followed by seven KERNEL2x2_SUB1 */ | |||||
| KERNEL2x2_SUBI1 | |||||
| KERNEL2x2_SUB1 | |||||
| KERNEL2x2_SUB1 | |||||
| KERNEL2x2_SUB1 | |||||
| KERNEL2x2_SUB1 | |||||
| KERNEL2x2_SUB1 | |||||
| KERNEL2x2_SUB1 | |||||
| KERNEL2x2_SUB1 | |||||
| b .LZGEMM_L2x2_SUB1 | |||||
| .LZGEMM_L2x2_SUB0: /* K < 8: KERNEL2x2_SUBI1 once, then the remainder loop for whatever is left */ | |||||
| andi. L, K, 7 /* L = K & 7 */ | |||||
| KERNEL2x2_SUBI1 | |||||
| addic. L, L, -1 | |||||
| ble .LZGEMM_L2x2_SAVE | |||||
| b .LZGEMM_L2x2_SUB2 | |||||
| .LZGEMM_L2x2_SUB1: /* remainder after the full chunks */ | |||||
| andi. L, K, 7 /* L = K & 7 */ | |||||
| ble .LZGEMM_L2x2_SAVE /* nothing left over */ | |||||
| .LZGEMM_L2x2_SUB2: /* one KERNEL2x2_SUB1 per remaining count in L */ | |||||
| KERNEL2x2_SUB1 | |||||
| addic. L, L, -1 | |||||
| bgt .LZGEMM_L2x2_SUB2 | |||||
| .LZGEMM_L2x2_SAVE: | |||||
| SAVE2x2 | |||||
| .LZGEMM_L2x2_END: | |||||
| .LZGEMM_L2x1_BEGIN: /* 2x1 tile, run at most once; the LOAD/KERNEL/SAVE macros are defined outside this file */ | |||||
| andi. T1, M, 1 | |||||
| ble .LZGEMM_L2x1_END /* M even: no single-row tile */ | |||||
| mr BO, B /* restart the B pointer */ | |||||
| srawi. L, K, 3 /* L = K >> 3: number of 8-invocation chunks */ | |||||
| ble .LZGEMM_L2x1_SUB0 /* K < 8: remainder path only */ | |||||
| cmpwi cr0, L, 1 | |||||
| ble .LZGEMM_L2x1_SUB4 /* exactly one chunk: non-pipelined path */ | |||||
| .LZGEMM_L2x1_LOOP_START: /* first chunk: initial load plus eight kernel macros */ | |||||
| LOAD2x1_1 | |||||
| KERNEL2x1_I1 | |||||
| KERNEL2x1_2 | |||||
| KERNEL2x1_1 | |||||
| KERNEL2x1_2 | |||||
| KERNEL2x1_1 | |||||
| KERNEL2x1_2 | |||||
| KERNEL2x1_1 | |||||
| KERNEL2x1_2 | |||||
| addic. L, L, -2 /* discount this chunk and the closing chunk at LOOP_END */ | |||||
| ble .LZGEMM_L2x1_LOOP_END | |||||
| .align 5 | |||||
| .LZGEMM_L2x1_LOOP: /* steady state: eight kernel macros per pass */ | |||||
| KERNEL2x1_1 | |||||
| KERNEL2x1_2 | |||||
| KERNEL2x1_1 | |||||
| KERNEL2x1_2 | |||||
| KERNEL2x1_1 | |||||
| KERNEL2x1_2 | |||||
| KERNEL2x1_1 | |||||
| KERNEL2x1_2 | |||||
| addic. L, L, -1 | |||||
| bgt .LZGEMM_L2x1_LOOP | |||||
| .LZGEMM_L2x1_LOOP_END: /* closing chunk: ends with KERNEL2x1_E2 in place of KERNEL2x1_2 */ | |||||
| KERNEL2x1_1 | |||||
| KERNEL2x1_2 | |||||
| KERNEL2x1_1 | |||||
| KERNEL2x1_2 | |||||
| KERNEL2x1_1 | |||||
| KERNEL2x1_2 | |||||
| KERNEL2x1_1 | |||||
| KERNEL2x1_E2 | |||||
| b .LZGEMM_L2x1_SUB1 | |||||
| .LZGEMM_L2x1_SUB4: /* single chunk: KERNEL2x1_SUBI1 followed by seven KERNEL2x1_SUB1 */ | |||||
| KERNEL2x1_SUBI1 | |||||
| KERNEL2x1_SUB1 | |||||
| KERNEL2x1_SUB1 | |||||
| KERNEL2x1_SUB1 | |||||
| KERNEL2x1_SUB1 | |||||
| KERNEL2x1_SUB1 | |||||
| KERNEL2x1_SUB1 | |||||
| KERNEL2x1_SUB1 | |||||
| b .LZGEMM_L2x1_SUB1 | |||||
| .LZGEMM_L2x1_SUB0: /* K < 8: KERNEL2x1_SUBI1 once, then the remainder loop for whatever is left */ | |||||
| andi. L, K, 7 /* L = K & 7 */ | |||||
| KERNEL2x1_SUBI1 | |||||
| addic. L, L, -1 | |||||
| ble .LZGEMM_L2x1_SAVE | |||||
| b .LZGEMM_L2x1_SUB2 | |||||
| .LZGEMM_L2x1_SUB1: /* remainder after the full chunks */ | |||||
| andi. L, K, 7 /* L = K & 7 */ | |||||
| ble .LZGEMM_L2x1_SAVE /* nothing left over */ | |||||
| .LZGEMM_L2x1_SUB2: /* one KERNEL2x1_SUB1 per remaining count in L */ | |||||
| KERNEL2x1_SUB1 | |||||
| addic. L, L, -1 | |||||
| bgt .LZGEMM_L2x1_SUB2 | |||||
| .LZGEMM_L2x1_SAVE: | |||||
| SAVE2x1 | |||||
| .LZGEMM_L2x1_END: | |||||
| slwi T1, K, 5 /* T1 = K * 32 */ | |||||
| add B, B, T1 /* advance B by K * 32 bytes (presumably 2 columns of 16-byte complex doubles per k - confirm) */ | |||||
| addic. J, J, -1 /* next column pair while J > 0 */ | |||||
| bgt .LZGEMM_L2_BEGIN | |||||
| andi. T2, N, 1 | |||||
| ble .L999 /* N even: nothing left, jump to the exit label (defined by the including file) */ | |||||
| .LZGEMM_L2_END: | |||||
| b .LZGEMM_L1_BEGIN /* hop over the trampoline below */ | |||||
| .L999_H1: /* trampoline: forwards to .L999 */ | |||||
| b .L999 | |||||
| .LZGEMM_L1_BEGIN: /* single leftover column (taken only when N is odd) */ | |||||
| andi. T1, N, 1 | |||||
| ble .LZGEMM_L1_END /* N even: nothing to do here */ | |||||
| mr CO, C /* CO = output pointer for this column */ | |||||
| mr AO, A /* restart the A pointer */ | |||||
| srawi. I, M, 3 /* I = M >> 3: number of 8-row tiles */ | |||||
| ble .LZGEMM_L1x8_END /* fewer than 8 rows: skip the 1x8 tile loop */ | |||||
| .LZGEMM_L1x8_BEGIN: /* 1x8 tile loop; the LOAD/KERNEL/SAVE macros are defined outside this file */ | |||||
| mr BO, B /* restart the B pointer for every tile */ | |||||
| srawi. L, K, 3 /* L = K >> 3: number of 8-invocation chunks */ | |||||
| ble .LZGEMM_L1x8_SUB0 /* K < 8: remainder path only */ | |||||
| cmpwi cr0, L, 1 | |||||
| ble .LZGEMM_L1x8_SUB4 /* exactly one chunk: non-pipelined path */ | |||||
| .LZGEMM_L1x8_LOOP_START: /* first chunk: initial load plus eight kernel macros, each preceded by a prefetch hint at AO + PRE */ | |||||
| dcbt AO, PRE | |||||
| LOAD1x8_1 | |||||
| dcbt AO, PRE | |||||
| KERNEL1x8_I1 | |||||
| dcbt AO, PRE | |||||
| KERNEL1x8_2 | |||||
| dcbt AO, PRE | |||||
| KERNEL1x8_1 | |||||
| dcbt AO, PRE | |||||
| KERNEL1x8_2 | |||||
| dcbt AO, PRE | |||||
| KERNEL1x8_1 | |||||
| dcbt AO, PRE | |||||
| KERNEL1x8_2 | |||||
| dcbt AO, PRE | |||||
| KERNEL1x8_1 | |||||
| dcbt AO, PRE | |||||
| KERNEL1x8_2 | |||||
| addic. L, L, -2 /* discount this chunk and the closing chunk at LOOP_END */ | |||||
| ble .LZGEMM_L1x8_LOOP_END | |||||
| .align 5 | |||||
| .LZGEMM_L1x8_LOOP: /* steady state: eight kernel macros per pass */ | |||||
| dcbt AO, PRE | |||||
| KERNEL1x8_1 | |||||
| dcbt AO, PRE | |||||
| KERNEL1x8_2 | |||||
| dcbt AO, PRE | |||||
| KERNEL1x8_1 | |||||
| dcbt AO, PRE | |||||
| KERNEL1x8_2 | |||||
| dcbt AO, PRE | |||||
| KERNEL1x8_1 | |||||
| dcbt AO, PRE | |||||
| KERNEL1x8_2 | |||||
| dcbt AO, PRE | |||||
| KERNEL1x8_1 | |||||
| dcbt AO, PRE | |||||
| KERNEL1x8_2 | |||||
| addic. L, L, -1 | |||||
| bgt .LZGEMM_L1x8_LOOP | |||||
| .LZGEMM_L1x8_LOOP_END: /* closing chunk: ends with KERNEL1x8_E2 in place of KERNEL1x8_2 */ | |||||
| dcbt AO, PRE | |||||
| KERNEL1x8_1 | |||||
| dcbt AO, PRE | |||||
| KERNEL1x8_2 | |||||
| dcbt AO, PRE | |||||
| KERNEL1x8_1 | |||||
| dcbt AO, PRE | |||||
| KERNEL1x8_2 | |||||
| dcbt AO, PRE | |||||
| KERNEL1x8_1 | |||||
| dcbt AO, PRE | |||||
| KERNEL1x8_2 | |||||
| dcbt AO, PRE | |||||
| KERNEL1x8_1 | |||||
| KERNEL1x8_E2 | |||||
| b .LZGEMM_L1x8_SUB1 | |||||
| .LZGEMM_L1x8_SUB4: /* single chunk: KERNEL1x8_SUBI1 followed by seven KERNEL1x8_SUB1 */ | |||||
| dcbt AO, PRE | |||||
| KERNEL1x8_SUBI1 | |||||
| dcbt AO, PRE | |||||
| KERNEL1x8_SUB1 | |||||
| dcbt AO, PRE | |||||
| KERNEL1x8_SUB1 | |||||
| dcbt AO, PRE | |||||
| KERNEL1x8_SUB1 | |||||
| KERNEL1x8_SUB1 | |||||
| KERNEL1x8_SUB1 | |||||
| KERNEL1x8_SUB1 | |||||
| KERNEL1x8_SUB1 | |||||
| b .LZGEMM_L1x8_SUB1 | |||||
| .LZGEMM_L1x8_SUB0: /* K < 8: KERNEL1x8_SUBI1 once, then the remainder loop for whatever is left */ | |||||
| andi. L, K, 7 /* L = K & 7 */ | |||||
| KERNEL1x8_SUBI1 | |||||
| addic. L, L, -1 | |||||
| ble .LZGEMM_L1x8_SAVE | |||||
| b .LZGEMM_L1x8_SUB2 | |||||
| .LZGEMM_L1x8_SUB1: /* remainder after the full chunks */ | |||||
| andi. L, K, 7 /* L = K & 7 */ | |||||
| ble .LZGEMM_L1x8_SAVE /* nothing left over */ | |||||
| .LZGEMM_L1x8_SUB2: /* one KERNEL1x8_SUB1 per remaining count in L */ | |||||
| KERNEL1x8_SUB1 | |||||
| addic. L, L, -1 | |||||
| bgt .LZGEMM_L1x8_SUB2 | |||||
| .LZGEMM_L1x8_SAVE: | |||||
| SAVE1x8 | |||||
| addic. I, I, -1 /* next 8-row tile while I > 0 */ | |||||
| bgt .LZGEMM_L1x8_BEGIN | |||||
| .LZGEMM_L1x8_END: | |||||
| .LZGEMM_L1x4_BEGIN: /* 1x4 tile, run at most once; the LOAD/KERNEL/SAVE macros are defined outside this file */ | |||||
| andi. T2, M, 7 | |||||
| ble .LZGEMM_L1x1_END /* M is a multiple of 8: skip all narrower tiles */ | |||||
| andi. T1, M, 4 | |||||
| ble .LZGEMM_L1x4_END /* bit 2 of M clear: no 4-row tile */ | |||||
| mr BO, B /* restart the B pointer */ | |||||
| srawi. L, K, 3 /* L = K >> 3: number of 8-invocation chunks */ | |||||
| ble .LZGEMM_L1x4_SUB0 /* K < 8: remainder path only */ | |||||
| cmpwi cr0, L, 1 | |||||
| ble .LZGEMM_L1x4_SUB4 /* exactly one chunk: non-pipelined path */ | |||||
| .LZGEMM_L1x4_LOOP_START: /* first chunk: initial load plus eight kernel macros */ | |||||
| LOAD1x4_1 | |||||
| KERNEL1x4_I1 | |||||
| KERNEL1x4_2 | |||||
| KERNEL1x4_1 | |||||
| KERNEL1x4_2 | |||||
| KERNEL1x4_1 | |||||
| KERNEL1x4_2 | |||||
| KERNEL1x4_1 | |||||
| KERNEL1x4_2 | |||||
| addic. L, L, -2 /* discount this chunk and the closing chunk at LOOP_END */ | |||||
| ble .LZGEMM_L1x4_LOOP_END | |||||
| .align 5 | |||||
| .LZGEMM_L1x4_LOOP: /* steady state: eight kernel macros per pass */ | |||||
| KERNEL1x4_1 | |||||
| KERNEL1x4_2 | |||||
| KERNEL1x4_1 | |||||
| KERNEL1x4_2 | |||||
| KERNEL1x4_1 | |||||
| KERNEL1x4_2 | |||||
| KERNEL1x4_1 | |||||
| KERNEL1x4_2 | |||||
| addic. L, L, -1 | |||||
| bgt .LZGEMM_L1x4_LOOP | |||||
| .LZGEMM_L1x4_LOOP_END: /* closing chunk: ends with KERNEL1x4_E2 in place of KERNEL1x4_2 */ | |||||
| KERNEL1x4_1 | |||||
| KERNEL1x4_2 | |||||
| KERNEL1x4_1 | |||||
| KERNEL1x4_2 | |||||
| KERNEL1x4_1 | |||||
| KERNEL1x4_2 | |||||
| KERNEL1x4_1 | |||||
| KERNEL1x4_E2 | |||||
| b .LZGEMM_L1x4_SUB1 | |||||
| .LZGEMM_L1x4_SUB4: /* single chunk: KERNEL1x4_SUBI1 followed by seven KERNEL1x4_SUB1 */ | |||||
| KERNEL1x4_SUBI1 | |||||
| KERNEL1x4_SUB1 | |||||
| KERNEL1x4_SUB1 | |||||
| KERNEL1x4_SUB1 | |||||
| KERNEL1x4_SUB1 | |||||
| KERNEL1x4_SUB1 | |||||
| KERNEL1x4_SUB1 | |||||
| KERNEL1x4_SUB1 | |||||
| b .LZGEMM_L1x4_SUB1 | |||||
| .LZGEMM_L1x4_SUB0: /* K < 8: KERNEL1x4_SUBI1 once, then the remainder loop for whatever is left */ | |||||
| andi. L, K, 7 /* L = K & 7 */ | |||||
| KERNEL1x4_SUBI1 | |||||
| addic. L, L, -1 | |||||
| ble .LZGEMM_L1x4_SAVE | |||||
| b .LZGEMM_L1x4_SUB2 | |||||
| .LZGEMM_L1x4_SUB1: /* remainder after the full chunks */ | |||||
| andi. L, K, 7 /* L = K & 7 */ | |||||
| ble .LZGEMM_L1x4_SAVE /* nothing left over */ | |||||
| .LZGEMM_L1x4_SUB2: /* one KERNEL1x4_SUB1 per remaining count in L */ | |||||
| KERNEL1x4_SUB1 | |||||
| addic. L, L, -1 | |||||
| bgt .LZGEMM_L1x4_SUB2 | |||||
| .LZGEMM_L1x4_SAVE: | |||||
| SAVE1x4 | |||||
| .LZGEMM_L1x4_END: | |||||
| .LZGEMM_L1x2_BEGIN: /* 1x2 tile, run at most once; the LOAD/KERNEL/SAVE macros are defined outside this file */ | |||||
| andi. T1, M, 2 | |||||
| ble .LZGEMM_L1x2_END /* bit 1 of M clear: no 2-row tile */ | |||||
| mr BO, B /* restart the B pointer */ | |||||
| srawi. L, K, 3 /* L = K >> 3: number of 8-invocation chunks */ | |||||
| ble .LZGEMM_L1x2_SUB0 /* K < 8: remainder path only */ | |||||
| cmpwi cr0, L, 1 | |||||
| ble .LZGEMM_L1x2_SUB4 /* exactly one chunk: non-pipelined path */ | |||||
| .LZGEMM_L1x2_LOOP_START: /* first chunk: initial load plus eight kernel macros */ | |||||
| LOAD1x2_1 | |||||
| KERNEL1x2_I1 | |||||
| KERNEL1x2_2 | |||||
| KERNEL1x2_1 | |||||
| KERNEL1x2_2 | |||||
| KERNEL1x2_1 | |||||
| KERNEL1x2_2 | |||||
| KERNEL1x2_1 | |||||
| KERNEL1x2_2 | |||||
| addic. L, L, -2 /* discount this chunk and the closing chunk at LOOP_END */ | |||||
| ble .LZGEMM_L1x2_LOOP_END | |||||
| .align 5 | |||||
| .LZGEMM_L1x2_LOOP: /* steady state: eight kernel macros per pass */ | |||||
| KERNEL1x2_1 | |||||
| KERNEL1x2_2 | |||||
| KERNEL1x2_1 | |||||
| KERNEL1x2_2 | |||||
| KERNEL1x2_1 | |||||
| KERNEL1x2_2 | |||||
| KERNEL1x2_1 | |||||
| KERNEL1x2_2 | |||||
| addic. L, L, -1 | |||||
| bgt .LZGEMM_L1x2_LOOP | |||||
| .LZGEMM_L1x2_LOOP_END: /* closing chunk: ends with KERNEL1x2_E2 in place of KERNEL1x2_2 */ | |||||
| KERNEL1x2_1 | |||||
| KERNEL1x2_2 | |||||
| KERNEL1x2_1 | |||||
| KERNEL1x2_2 | |||||
| KERNEL1x2_1 | |||||
| KERNEL1x2_2 | |||||
| KERNEL1x2_1 | |||||
| KERNEL1x2_E2 | |||||
| b .LZGEMM_L1x2_SUB1 | |||||
| .LZGEMM_L1x2_SUB4: /* single chunk: KERNEL1x2_SUBI1 followed by seven KERNEL1x2_SUB1 */ | |||||
| KERNEL1x2_SUBI1 | |||||
| KERNEL1x2_SUB1 | |||||
| KERNEL1x2_SUB1 | |||||
| KERNEL1x2_SUB1 | |||||
| KERNEL1x2_SUB1 | |||||
| KERNEL1x2_SUB1 | |||||
| KERNEL1x2_SUB1 | |||||
| KERNEL1x2_SUB1 | |||||
| b .LZGEMM_L1x2_SUB1 | |||||
| .LZGEMM_L1x2_SUB0: /* K < 8: KERNEL1x2_SUBI1 once, then the remainder loop for whatever is left */ | |||||
| andi. L, K, 7 /* L = K & 7 */ | |||||
| KERNEL1x2_SUBI1 | |||||
| addic. L, L, -1 | |||||
| ble .LZGEMM_L1x2_SAVE | |||||
| b .LZGEMM_L1x2_SUB2 | |||||
| .LZGEMM_L1x2_SUB1: /* remainder after the full chunks */ | |||||
| andi. L, K, 7 /* L = K & 7 */ | |||||
| ble .LZGEMM_L1x2_SAVE /* nothing left over */ | |||||
| .LZGEMM_L1x2_SUB2: /* one KERNEL1x2_SUB1 per remaining count in L */ | |||||
| KERNEL1x2_SUB1 | |||||
| addic. L, L, -1 | |||||
| bgt .LZGEMM_L1x2_SUB2 | |||||
| .LZGEMM_L1x2_SAVE: | |||||
| SAVE1x2 | |||||
| .LZGEMM_L1x2_END: | |||||
| .LZGEMM_L1x1_BEGIN: /* L1x1 section: executed only when bit 0 of M is set. LOAD1x1_1, KERNEL1x1_* and SAVE1x1 are macros defined outside this chunk. */ | |||||
| andi. T1, M, 1 /* T1 = M & 1, result recorded in cr0 */ | |||||
| ble .LZGEMM_L1x1_END /* result is 0: skip the whole section */ | |||||
| mr BO, B /* BO = B */ | |||||
| srawi. L, K, 3 /* L = K >> 3: number of 8-step groups */ | |||||
| ble .LZGEMM_L1x1_SUB0 /* L <= 0, i.e. K < 8: take the short path */ | |||||
| cmpwi cr0, L, 1 | |||||
| ble .LZGEMM_L1x1_SUB4 /* L == 1: a single group, done with the SUB macro sequence */ | |||||
| .LZGEMM_L1x1_LOOP_START: /* reached with L >= 2: first group = LOAD, I1, then 7 more kernel macros */ | |||||
| LOAD1x1_1 | |||||
| KERNEL1x1_I1 | |||||
| KERNEL1x1_2 | |||||
| KERNEL1x1_1 | |||||
| KERNEL1x1_2 | |||||
| KERNEL1x1_1 | |||||
| KERNEL1x1_2 | |||||
| KERNEL1x1_1 | |||||
| KERNEL1x1_2 | |||||
| addic. L, L, -2 /* discount this first group and the final group at LOOP_END */ | |||||
| ble .LZGEMM_L1x1_LOOP_END /* L was 2: there are no middle groups */ | |||||
| .align 5 | |||||
| .LZGEMM_L1x1_LOOP: /* middle groups: 8 kernel macros per pass */ | |||||
| KERNEL1x1_1 | |||||
| KERNEL1x1_2 | |||||
| KERNEL1x1_1 | |||||
| KERNEL1x1_2 | |||||
| KERNEL1x1_1 | |||||
| KERNEL1x1_2 | |||||
| KERNEL1x1_1 | |||||
| KERNEL1x1_2 | |||||
| addic. L, L, -1 | |||||
| bgt .LZGEMM_L1x1_LOOP /* repeat while L > 0 */ | |||||
| .LZGEMM_L1x1_LOOP_END: /* final group: same pattern but the last macro is the E2 variant */ | |||||
| KERNEL1x1_1 | |||||
| KERNEL1x1_2 | |||||
| KERNEL1x1_1 | |||||
| KERNEL1x1_2 | |||||
| KERNEL1x1_1 | |||||
| KERNEL1x1_2 | |||||
| KERNEL1x1_1 | |||||
| KERNEL1x1_E2 | |||||
| b .LZGEMM_L1x1_SUB1 /* continue with the K & 7 remainder */ | |||||
| .LZGEMM_L1x1_SUB4: /* single-group path: SUBI1 followed by 7 x SUB1 (8 steps) */ | |||||
| KERNEL1x1_SUBI1 | |||||
| KERNEL1x1_SUB1 | |||||
| KERNEL1x1_SUB1 | |||||
| KERNEL1x1_SUB1 | |||||
| KERNEL1x1_SUB1 | |||||
| KERNEL1x1_SUB1 | |||||
| KERNEL1x1_SUB1 | |||||
| KERNEL1x1_SUB1 | |||||
| b .LZGEMM_L1x1_SUB1 /* continue with the K & 7 remainder */ | |||||
| .LZGEMM_L1x1_SUB0: /* K < 8 path. NOTE(review): SUBI1 runs unconditionally, so this presumably relies on K >= 1 - confirm against the caller. */ | |||||
| andi. L, K, 7 /* L = K & 7 */ | |||||
| KERNEL1x1_SUBI1 | |||||
| addic. L, L, -1 /* one step was already consumed by SUBI1 */ | |||||
| ble .LZGEMM_L1x1_SAVE /* nothing left: go store */ | |||||
| b .LZGEMM_L1x1_SUB2 | |||||
| .LZGEMM_L1x1_SUB1: /* remainder after the 8-step groups */ | |||||
| andi. L, K, 7 /* L = K & 7 */ | |||||
| ble .LZGEMM_L1x1_SAVE /* no remainder */ | |||||
| .LZGEMM_L1x1_SUB2: /* one SUB1 macro per remaining step */ | |||||
| KERNEL1x1_SUB1 | |||||
| addic. L, L, -1 | |||||
| bgt .LZGEMM_L1x1_SUB2 /* repeat while L > 0 */ | |||||
| .LZGEMM_L1x1_SAVE: | |||||
| SAVE1x1 | |||||
| .LZGEMM_L1x1_END: | |||||
| .LZGEMM_L1_END: /* common exit label for the L1 sections */ | |||||
| @@ -170,6 +170,11 @@ | |||||
| #define PREFETCHSIZE_C 24 | #define PREFETCHSIZE_C 24 | ||||
| #endif | #endif | ||||
| #ifdef POWER8 /* POWER8-specific prefetch tuning for this kernel */ | |||||
| #define PREFETCHSIZE_A 24 /* presumably the prefetch look-ahead for the A operand; units are set by the users of this macro - confirm */ | |||||
| #define PREFETCHSIZE_C 24 /* presumably the prefetch look-ahead for the C operand - confirm */ | |||||
| #endif | |||||
| #ifndef XCONJ | #ifndef XCONJ | ||||
| #define FMADDR FMADD | #define FMADDR FMADD | ||||
| #define FMSUBR FNMSUB | #define FMSUBR FNMSUB | ||||
| @@ -144,6 +144,12 @@ | |||||
| #define PREFETCHSIZE_C 8 | #define PREFETCHSIZE_C 8 | ||||
| #endif | #endif | ||||
| #ifdef POWER8 /* POWER8-specific prefetch tuning for this kernel */ | |||||
| #define PREFETCHSIZE_A 24 /* presumably the prefetch look-ahead for the A operand; units are set by the users of this macro - confirm */ | |||||
| #define PREFETCHSIZE_C 8 /* presumably the prefetch look-ahead for the C operand - confirm */ | |||||
| #endif | |||||
| #if !(defined(CONJ) && defined(XCONJ)) | #if !(defined(CONJ) && defined(XCONJ)) | ||||
| #define FMADDR FMADD | #define FMADDR FMADD | ||||
| #define FMSUBR FNMSUB | #define FMSUBR FNMSUB | ||||
| @@ -0,0 +1,377 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013-2016, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| /************************************************************************************** | |||||
| * 2016/03/05 Werner Saar (wernsaar@googlemail.com) | |||||
| * BLASTEST : OK | |||||
| * CTEST : OK | |||||
| * TEST : OK | |||||
| * LAPACK-TEST : OK | |||||
| **************************************************************************************/ | |||||
| /*********************************************************************/ | |||||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||||
| /* All rights reserved. */ | |||||
| /* */ | |||||
| /* Redistribution and use in source and binary forms, with or */ | |||||
| /* without modification, are permitted provided that the following */ | |||||
| /* conditions are met: */ | |||||
| /* */ | |||||
| /* 1. Redistributions of source code must retain the above */ | |||||
| /* copyright notice, this list of conditions and the following */ | |||||
| /* disclaimer. */ | |||||
| /* */ | |||||
| /* 2. Redistributions in binary form must reproduce the above */ | |||||
| /* copyright notice, this list of conditions and the following */ | |||||
| /* disclaimer in the documentation and/or other materials */ | |||||
| /* provided with the distribution. */ | |||||
| /* */ | |||||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||||
| /* */ | |||||
| /* The views and conclusions contained in the software and */ | |||||
| /* documentation are those of the authors and should not be */ | |||||
| /* interpreted as representing official policies, either expressed */ | |||||
| /* or implied, of The University of Texas at Austin. */ | |||||
| /*********************************************************************/ | |||||
| #define ASSEMBLER /* defined before including common.h */ | |||||
| #include "common.h" | |||||
| #include "def_vsx.h" | |||||
| #ifndef __64BIT__ | |||||
| #define LOAD lwz /* word-sized load on 32-bit builds */ | |||||
| #else | |||||
| #define LOAD ld /* doubleword load on 64-bit builds */ | |||||
| #endif | |||||
| #ifdef __64BIT__ /* 64-bit frame: the prologue stores f14-f31 at 0..136 and r13-r31 at 144..288, then the three slots below */ | |||||
| #define STACKSIZE 320 | |||||
| #define ALPHA_R_SP 296(SP) | |||||
| #define ALPHA_I_SP 304(SP) | |||||
| #define FZERO 312(SP) | |||||
| #else /* 32-bit frame: the prologue stores f14-f31 at 0..136 and r13-r31 at 144..216, then the three slots below */ | |||||
| #define STACKSIZE 256 | |||||
| #define ALPHA_R_SP 224(SP) | |||||
| #define ALPHA_I_SP 232(SP) | |||||
| #define FZERO 240(SP) | |||||
| #endif | |||||
| #define M r3 /* register aliases for the incoming arguments start here */ | |||||
| #define N r4 | |||||
| #define K r5 | |||||
| #ifdef linux /* the registers used for A, B, C, LDC and OFFSET depend on the OS and word size */ | |||||
| #ifndef __64BIT__ | |||||
| #define A r6 | |||||
| #define B r7 | |||||
| #define C r8 | |||||
| #define LDC r9 | |||||
| #define OFFSET r10 | |||||
| #else | |||||
| #define A r8 | |||||
| #define B r9 | |||||
| #define C r10 | |||||
| #define LDC r6 /* on 64-bit the prologue loads LDC (and OFFSET for TRMMKERNEL) from the stack into these registers */ | |||||
| #define OFFSET r7 | |||||
| #endif | |||||
| #endif | |||||
| #if defined(_AIX) || defined(__APPLE__) | |||||
| #if !defined(__64BIT__) && defined(DOUBLE) | |||||
| #define A r10 | |||||
| #define B r6 /* in this configuration the prologue loads B, C and LDC from the stack */ | |||||
| #define C r7 | |||||
| #define LDC r8 | |||||
| #define OFFSET r9 | |||||
| #else | |||||
| #define A r8 | |||||
| #define B r9 | |||||
| #define C r10 | |||||
| #define LDC r6 | |||||
| #define OFFSET r7 | |||||
| #endif | |||||
| #endif | |||||
| #define o0 0 /* literal zero, unlike the register-based o8..o48 aliases below */ | |||||
| #define alpha_r vs30 /* loaded from the ALPHA_R_SP slot after the prologue */ | |||||
| #define alpha_i vs31 /* loaded from the ALPHA_I_SP slot after the prologue */ | |||||
| #define KKK r13 /* NOTE(review): r13 is saved and restored by this kernel; check the target ABI's rules for r13 before relying on it as scratch */ | |||||
| #define K1 r14 | |||||
| #define L r15 | |||||
| #define ALPHA r16 /* set to the address of the ALPHA_R_SP slot */ | |||||
| #define o24 r17 /* set to 24 with li */ | |||||
| #define T2 r19 /* note: r18 has no alias here although the prologue saves it */ | |||||
| #define KK r20 | |||||
| #define o8 r21 /* set to 8 with li */ | |||||
| #define I r22 | |||||
| #define J r23 | |||||
| #define AO r24 | |||||
| #define BO r25 | |||||
| #define CO r26 | |||||
| #define o16 r27 /* set to 16 with li */ | |||||
| #define o32 r28 /* set to 32 with li */ | |||||
| #define o48 r29 /* set to 48 with li */ | |||||
| #define PRE r30 /* set to 256 with li */ | |||||
| #define T1 r31 | |||||
| #ifndef NEEDPARAM /* the whole kernel body is omitted when NEEDPARAM is defined */ | |||||
| PROLOGUE | |||||
| PROFCODE | |||||
| addi SP, SP, -STACKSIZE /* reserve STACKSIZE bytes below the incoming stack pointer */ | |||||
| li r0, 0 /* zero value stored to FZERO below */ | |||||
| stfd f14, 0(SP) /* save f14-f31 at offsets 0..136 */ | |||||
| stfd f15, 8(SP) | |||||
| stfd f16, 16(SP) | |||||
| stfd f17, 24(SP) | |||||
| stfd f18, 32(SP) | |||||
| stfd f19, 40(SP) | |||||
| stfd f20, 48(SP) | |||||
| stfd f21, 56(SP) | |||||
| stfd f22, 64(SP) | |||||
| stfd f23, 72(SP) | |||||
| stfd f24, 80(SP) | |||||
| stfd f25, 88(SP) | |||||
| stfd f26, 96(SP) | |||||
| stfd f27, 104(SP) | |||||
| stfd f28, 112(SP) | |||||
| stfd f29, 120(SP) | |||||
| stfd f30, 128(SP) | |||||
| stfd f31, 136(SP) | |||||
| #ifdef __64BIT__ | |||||
| std r31, 144(SP) /* save r13-r31 as doublewords at offsets 144..288 */ | |||||
| std r30, 152(SP) | |||||
| std r29, 160(SP) | |||||
| std r28, 168(SP) | |||||
| std r27, 176(SP) | |||||
| std r26, 184(SP) | |||||
| std r25, 192(SP) | |||||
| std r24, 200(SP) | |||||
| std r23, 208(SP) | |||||
| std r22, 216(SP) | |||||
| std r21, 224(SP) | |||||
| std r20, 232(SP) | |||||
| std r19, 240(SP) | |||||
| std r18, 248(SP) | |||||
| std r17, 256(SP) | |||||
| std r16, 264(SP) | |||||
| std r15, 272(SP) | |||||
| std r14, 280(SP) | |||||
| std r13, 288(SP) | |||||
| #else | |||||
| stw r31, 144(SP) /* save r13-r31 as words at offsets 144..216 */ | |||||
| stw r30, 148(SP) | |||||
| stw r29, 152(SP) | |||||
| stw r28, 156(SP) | |||||
| stw r27, 160(SP) | |||||
| stw r26, 164(SP) | |||||
| stw r25, 168(SP) | |||||
| stw r24, 172(SP) | |||||
| stw r23, 176(SP) | |||||
| stw r22, 180(SP) | |||||
| stw r21, 184(SP) | |||||
| stw r20, 188(SP) | |||||
| stw r19, 192(SP) | |||||
| stw r18, 196(SP) | |||||
| stw r17, 200(SP) | |||||
| stw r16, 204(SP) | |||||
| stw r15, 208(SP) | |||||
| stw r14, 212(SP) | |||||
| stw r13, 216(SP) | |||||
| #endif | |||||
| stfd f1, ALPHA_R_SP /* spill f1 and f2 to the alpha slots; they are reloaded into alpha_r/alpha_i below */ | |||||
| stfd f2, ALPHA_I_SP | |||||
| stw r0, FZERO /* NOTE(review): a word store, so only 4 bytes at FZERO are zeroed - confirm no reader loads this slot as a double */ | |||||
| #ifdef linux | |||||
| #ifdef __64BIT__ | |||||
| ld LDC, FRAMESLOT(0) + STACKSIZE(SP) /* LDC is read from memory at FRAMESLOT(0) relative to the pre-adjustment SP */ | |||||
| #endif | |||||
| #endif | |||||
| #if defined(_AIX) || defined(__APPLE__) | |||||
| #ifdef __64BIT__ | |||||
| ld LDC, FRAMESLOT(0) + STACKSIZE(SP) | |||||
| #else | |||||
| #ifdef DOUBLE | |||||
| lwz B, FRAMESLOT(0) + STACKSIZE(SP) /* 32-bit DOUBLE build: B, C and LDC are all read from frame slots */ | |||||
| lwz C, FRAMESLOT(1) + STACKSIZE(SP) | |||||
| lwz LDC, FRAMESLOT(2) + STACKSIZE(SP) | |||||
| #else | |||||
| lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) | |||||
| #endif | |||||
| #endif | |||||
| #endif | |||||
| #ifdef TRMMKERNEL /* TRMM builds also need OFFSET; it is read from a frame slot except on 32-bit linux, where no load is emitted */ | |||||
| #if defined(linux) && defined(__64BIT__) | |||||
| ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) | |||||
| #endif | |||||
| #if defined(_AIX) || defined(__APPLE__) | |||||
| #ifdef __64BIT__ | |||||
| ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) | |||||
| #else | |||||
| #ifdef DOUBLE | |||||
| lwz OFFSET, FRAMESLOT(3) + STACKSIZE(SP) | |||||
| #else | |||||
| lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) | |||||
| #endif | |||||
| #endif | |||||
| #endif | |||||
| #if defined(TRMMKERNEL) && !defined(LEFT) | |||||
| neg KK, OFFSET /* KK = -OFFSET when LEFT is not defined */ | |||||
| #endif | |||||
| #endif | |||||
| #include "zgemm_macros_8x2_power8.S" | |||||
| cmpwi cr0, M, 0 /* return immediately if M, N or K is <= 0; NOTE(review): cmpwi compares only the low 32 bits */ | |||||
| ble .L999 | |||||
| cmpwi cr0, N, 0 | |||||
| ble .L999 | |||||
| cmpwi cr0, K, 0 | |||||
| ble .L999 | |||||
| slwi LDC, LDC, ZBASE_SHIFT /* LDC <<= ZBASE_SHIFT (defined elsewhere); NOTE(review): slwi is a 32-bit shift - confirm the scaled LDC always fits in 32 bits */ | |||||
| li PRE, 256 | |||||
| li o8 , 8 /* o8..o48 hold constant values used as index-register operands, e.g. by the lxsdx below */ | |||||
| li o16 , 16 | |||||
| li o24 , 24 | |||||
| li o32 , 32 | |||||
| li o48 , 48 | |||||
| #ifdef __64BIT__ | |||||
| addi ALPHA, SP, 296 /* ALPHA = address of the ALPHA_R_SP slot (296 on 64-bit) */ | |||||
| #else | |||||
| addi ALPHA, SP, 224 /* ALPHA = address of the ALPHA_R_SP slot (224 on 32-bit) */ | |||||
| #endif | |||||
| lxsdx alpha_r, 0, ALPHA /* alpha_r = doubleword at ALPHA + 0 */ | |||||
| lxsdx alpha_i, o8, ALPHA /* alpha_i = doubleword at ALPHA + 8, i.e. the ALPHA_I_SP slot */ | |||||
| .align 4 | |||||
| #include "ztrmm_logic_8x2_power8.S" | |||||
| .L999: /* common exit: set r3 to 0, restore saved registers, release the frame */ | |||||
| addi r3, 0, 0 /* r3 = 0 */ | |||||
| lfd f14, 0(SP) /* restore f14-f31 */ | |||||
| lfd f15, 8(SP) | |||||
| lfd f16, 16(SP) | |||||
| lfd f17, 24(SP) | |||||
| lfd f18, 32(SP) | |||||
| lfd f19, 40(SP) | |||||
| lfd f20, 48(SP) | |||||
| lfd f21, 56(SP) | |||||
| lfd f22, 64(SP) | |||||
| lfd f23, 72(SP) | |||||
| lfd f24, 80(SP) | |||||
| lfd f25, 88(SP) | |||||
| lfd f26, 96(SP) | |||||
| lfd f27, 104(SP) | |||||
| lfd f28, 112(SP) | |||||
| lfd f29, 120(SP) | |||||
| lfd f30, 128(SP) | |||||
| lfd f31, 136(SP) | |||||
| #ifdef __64BIT__ | |||||
| ld r31, 144(SP) /* restore r13-r31 (64-bit layout) */ | |||||
| ld r30, 152(SP) | |||||
| ld r29, 160(SP) | |||||
| ld r28, 168(SP) | |||||
| ld r27, 176(SP) | |||||
| ld r26, 184(SP) | |||||
| ld r25, 192(SP) | |||||
| ld r24, 200(SP) | |||||
| ld r23, 208(SP) | |||||
| ld r22, 216(SP) | |||||
| ld r21, 224(SP) | |||||
| ld r20, 232(SP) | |||||
| ld r19, 240(SP) | |||||
| ld r18, 248(SP) | |||||
| ld r17, 256(SP) | |||||
| ld r16, 264(SP) | |||||
| ld r15, 272(SP) | |||||
| ld r14, 280(SP) | |||||
| ld r13, 288(SP) | |||||
| #else | |||||
| lwz r31, 144(SP) /* restore r13-r31 (32-bit layout) */ | |||||
| lwz r30, 148(SP) | |||||
| lwz r29, 152(SP) | |||||
| lwz r28, 156(SP) | |||||
| lwz r27, 160(SP) | |||||
| lwz r26, 164(SP) | |||||
| lwz r25, 168(SP) | |||||
| lwz r24, 172(SP) | |||||
| lwz r23, 176(SP) | |||||
| lwz r22, 180(SP) | |||||
| lwz r21, 184(SP) | |||||
| lwz r20, 188(SP) | |||||
| lwz r19, 192(SP) | |||||
| lwz r18, 196(SP) | |||||
| lwz r17, 200(SP) | |||||
| lwz r16, 204(SP) | |||||
| lwz r15, 208(SP) | |||||
| lwz r14, 212(SP) | |||||
| lwz r13, 216(SP) | |||||
| #endif | |||||
| addi SP, SP, STACKSIZE /* release the frame */ | |||||
| blr | |||||
| EPILOGUE | |||||
| #endif | |||||
| @@ -1959,6 +1959,43 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #endif | #endif | ||||
| #if defined(POWER8) /* tuning parameters selected when building for POWER8 */ | |||||
| #define SNUMOPT 4 | |||||
| #define DNUMOPT 8 | |||||
| #define GEMM_DEFAULT_OFFSET_A 384 | |||||
| #define GEMM_DEFAULT_OFFSET_B 1024 | |||||
| #define GEMM_DEFAULT_ALIGN 0x03fffUL /* low 14 bits set */ | |||||
| #define SGEMM_DEFAULT_UNROLL_M 4 /* per-precision UNROLL_M / UNROLL_N pairs follow (S, D, C, Z) */ | |||||
| #define SGEMM_DEFAULT_UNROLL_N 4 | |||||
| #define DGEMM_DEFAULT_UNROLL_M 16 | |||||
| #define DGEMM_DEFAULT_UNROLL_N 4 | |||||
| #define CGEMM_DEFAULT_UNROLL_M 2 | |||||
| #define CGEMM_DEFAULT_UNROLL_N 2 | |||||
| #define ZGEMM_DEFAULT_UNROLL_M 8 /* 8x2 matches the zgemm/ztrmm 8x2 POWER8 kernel file names */ | |||||
| #define ZGEMM_DEFAULT_UNROLL_N 2 | |||||
| #define SGEMM_DEFAULT_P 992 /* per-precision P values */ | |||||
| #define DGEMM_DEFAULT_P 480 | |||||
| #define CGEMM_DEFAULT_P 488 | |||||
| #define ZGEMM_DEFAULT_P 240 | |||||
| #define SGEMM_DEFAULT_Q 504 /* per-precision Q values */ | |||||
| #define DGEMM_DEFAULT_Q 720 | |||||
| #define CGEMM_DEFAULT_Q 400 | |||||
| #define ZGEMM_DEFAULT_Q 360 | |||||
| #define SGEMM_DEFAULT_R 28800 /* R values; NOTE(review): no CGEMM_DEFAULT_R is defined in this block - presumably a fallback elsewhere supplies it, confirm */ | |||||
| #define DGEMM_DEFAULT_R 14400 | |||||
| #define ZGEMM_DEFAULT_R 7200 | |||||
| #define SYMV_P 8 | |||||
| #endif | |||||
| #if defined(SPARC) && defined(V7) | #if defined(SPARC) && defined(V7) | ||||
| #define SNUMOPT 4 | #define SNUMOPT 4 | ||||