power9 makefile. dgemm based on power8 kernel with following changes …tags/v0.3.6^2
| @@ -9,7 +9,15 @@ else | |||||
| USE_OPENMP = 1 | USE_OPENMP = 1 | ||||
| endif | endif | ||||
| ifeq ($(CORE), POWER9) | |||||
| ifeq ($(USE_OPENMP), 1) | |||||
| COMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp | |||||
| FCOMMON_OPT += -O2 -frecursive -mcpu=power9 -mtune=power9 -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp | |||||
| else | |||||
| COMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -malign-power -fno-fast-math | |||||
| FCOMMON_OPT += -O2 -frecursive -mcpu=power9 -mtune=power9 -malign-power -fno-fast-math | |||||
| endif | |||||
| endif | |||||
| ifeq ($(CORE), POWER8) | ifeq ($(CORE), POWER8) | ||||
| ifeq ($(USE_OPENMP), 1) | ifeq ($(USE_OPENMP), 1) | ||||
| @@ -48,6 +48,7 @@ POWER5 | |||||
| POWER6 | POWER6 | ||||
| POWER7 | POWER7 | ||||
| POWER8 | POWER8 | ||||
| POWER9 | |||||
| PPCG4 | PPCG4 | ||||
| PPC970 | PPC970 | ||||
| PPC970MP | PPC970MP | ||||
| @@ -348,6 +348,11 @@ typedef int blasint; | |||||
| #endif | #endif | ||||
| #endif | #endif | ||||
| #ifdef POWER9 /* POWER9 builds only: provide a default YIELDING if none has been defined yet */ | |||||
| #ifndef YIELDING /* keep any earlier definition; note the fallback expansion below already ends with ';' */ | |||||
| #define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n"); | |||||
| #endif /* YIELDING: fallback is a single volatile asm statement of eight nop instructions */ | |||||
| #endif /* POWER9 */ | |||||
| /* | /* | ||||
| #ifdef PILEDRIVER | #ifdef PILEDRIVER | ||||
| @@ -39,7 +39,7 @@ | |||||
| #ifndef COMMON_POWER | #ifndef COMMON_POWER | ||||
| #define COMMON_POWER | #define COMMON_POWER | ||||
| #if defined(POWER8) | |||||
| #if defined(POWER8) || defined(POWER9) | |||||
| #define MB __asm__ __volatile__ ("eieio":::"memory") | #define MB __asm__ __volatile__ ("eieio":::"memory") | ||||
| #define WMB __asm__ __volatile__ ("eieio":::"memory") | #define WMB __asm__ __volatile__ ("eieio":::"memory") | ||||
| #else | #else | ||||
| @@ -241,7 +241,7 @@ static inline int blas_quickdivide(blasint x, blasint y){ | |||||
| #define HAVE_PREFETCH | #define HAVE_PREFETCH | ||||
| #endif | #endif | ||||
| #if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8) || ( defined(PPC970) && defined(OS_DARWIN) ) | |||||
| #if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8) || defined(POWER9) || ( defined(PPC970) && defined(OS_DARWIN) ) | |||||
| #define DCBT_ARG 0 | #define DCBT_ARG 0 | ||||
| #else | #else | ||||
| #define DCBT_ARG 8 | #define DCBT_ARG 8 | ||||
| @@ -263,7 +263,7 @@ static inline int blas_quickdivide(blasint x, blasint y){ | |||||
| #define L1_PREFETCH dcbtst | #define L1_PREFETCH dcbtst | ||||
| #endif | #endif | ||||
| #if defined(POWER8) | |||||
| #if defined(POWER8) || defined(POWER9) | |||||
| #define L1_DUALFETCH | #define L1_DUALFETCH | ||||
| #define L1_PREFETCHSIZE (16 + 128 * 100) | #define L1_PREFETCHSIZE (16 + 128 * 100) | ||||
| #define L1_PREFETCH dcbtst | #define L1_PREFETCH dcbtst | ||||
| @@ -812,7 +812,7 @@ Lmcount$lazy_ptr: | |||||
| #define BUFFER_SIZE ( 2 << 20) | #define BUFFER_SIZE ( 2 << 20) | ||||
| #elif defined(PPC440FP2) | #elif defined(PPC440FP2) | ||||
| #define BUFFER_SIZE ( 16 << 20) | #define BUFFER_SIZE ( 16 << 20) | ||||
| #elif defined(POWER8) | |||||
| #elif defined(POWER8) || defined(POWER9) | |||||
| #define BUFFER_SIZE ( 64 << 20) | #define BUFFER_SIZE ( 64 << 20) | ||||
| #else | #else | ||||
| #define BUFFER_SIZE ( 16 << 20) | #define BUFFER_SIZE ( 16 << 20) | ||||
| @@ -94,7 +94,7 @@ char *corename[] = { | |||||
| "CELL", | "CELL", | ||||
| "PPCG4", | "PPCG4", | ||||
| "POWER8", | "POWER8", | ||||
| "POWER8" | |||||
| "POWER9" | |||||
| }; | }; | ||||
| int detect(void){ | int detect(void){ | ||||
| @@ -124,7 +124,7 @@ int detect(void){ | |||||
| if (!strncasecmp(p, "POWER6", 6)) return CPUTYPE_POWER6; | if (!strncasecmp(p, "POWER6", 6)) return CPUTYPE_POWER6; | ||||
| if (!strncasecmp(p, "POWER7", 6)) return CPUTYPE_POWER6; | if (!strncasecmp(p, "POWER7", 6)) return CPUTYPE_POWER6; | ||||
| if (!strncasecmp(p, "POWER8", 6)) return CPUTYPE_POWER8; | if (!strncasecmp(p, "POWER8", 6)) return CPUTYPE_POWER8; | ||||
| if (!strncasecmp(p, "POWER9", 6)) return CPUTYPE_POWER8; | |||||
| if (!strncasecmp(p, "POWER9", 6)) return CPUTYPE_POWER9; | |||||
| if (!strncasecmp(p, "Cell", 4)) return CPUTYPE_CELL; | if (!strncasecmp(p, "Cell", 4)) return CPUTYPE_CELL; | ||||
| if (!strncasecmp(p, "7447", 4)) return CPUTYPE_PPCG4; | if (!strncasecmp(p, "7447", 4)) return CPUTYPE_PPCG4; | ||||
| @@ -156,7 +156,7 @@ int detect(void){ | |||||
| if (!strncasecmp(p, "POWER6", 6)) return CPUTYPE_POWER6; | if (!strncasecmp(p, "POWER6", 6)) return CPUTYPE_POWER6; | ||||
| if (!strncasecmp(p, "POWER7", 6)) return CPUTYPE_POWER6; | if (!strncasecmp(p, "POWER7", 6)) return CPUTYPE_POWER6; | ||||
| if (!strncasecmp(p, "POWER8", 6)) return CPUTYPE_POWER8; | if (!strncasecmp(p, "POWER8", 6)) return CPUTYPE_POWER8; | ||||
| if (!strncasecmp(p, "POWER9", 6)) return CPUTYPE_POWER8; | |||||
| if (!strncasecmp(p, "POWER9", 6)) return CPUTYPE_POWER9; | |||||
| if (!strncasecmp(p, "Cell", 4)) return CPUTYPE_CELL; | if (!strncasecmp(p, "Cell", 4)) return CPUTYPE_CELL; | ||||
| if (!strncasecmp(p, "7447", 4)) return CPUTYPE_PPCG4; | if (!strncasecmp(p, "7447", 4)) return CPUTYPE_PPCG4; | ||||
| return CPUTYPE_POWER5; | return CPUTYPE_POWER5; | ||||
| @@ -180,7 +180,7 @@ int id; | |||||
| __asm __volatile("mfpvr %0" : "=r"(id)); | __asm __volatile("mfpvr %0" : "=r"(id)); | ||||
| switch ( id >> 16 ) { | switch ( id >> 16 ) { | ||||
| case 0x4e: // POWER9 | case 0x4e: // POWER9 | ||||
| return CPUTYPE_POWER8; | |||||
| return CPUTYPE_POWER9; | |||||
| break; | break; | ||||
| case 0x4d: | case 0x4d: | ||||
| case 0x4b: // POWER8/8E | case 0x4b: // POWER8/8E | ||||
| @@ -637,6 +637,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define CORENAME "POWER8" | #define CORENAME "POWER8" | ||||
| #endif | #endif | ||||
| #if defined(FORCE_POWER9) /* Target description used when the build forces POWER9 */ | |||||
| #define FORCE | |||||
| #define ARCHITECTURE "POWER" | |||||
| #define SUBARCHITECTURE "POWER9" | |||||
| #define SUBDIRNAME "power" /* NOTE(review): presumably the kernel sub-directory name - confirm against the consumer of SUBDIRNAME */ | |||||
| #define ARCHCONFIG "-DPOWER9 " \ | |||||
| "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=128 " \ | |||||
| "-DL2_SIZE=4194304 -DL2_LINESIZE=128 " \ | |||||
| "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " | |||||
| #define LIBNAME "power9" /* ARCHCONFIG above: -DPOWER9 plus 32 KiB L1 data, 4 MiB L2, 128-byte lines; NOTE(review): confirm these sizes against POWER9 hardware */ | |||||
| #define CORENAME "POWER9" | |||||
| #endif /* FORCE_POWER9 */ | |||||
| #ifdef FORCE_PPCG4 | #ifdef FORCE_PPCG4 | ||||
| #define FORCE | #define FORCE | ||||
| @@ -44,6 +44,10 @@ ifeq ($(CORE), POWER8) | |||||
| USE_TRMM = 1 | USE_TRMM = 1 | ||||
| endif | endif | ||||
| ifeq ($(CORE), POWER9) | |||||
| USE_TRMM = 1 | |||||
| endif | |||||
| ifeq ($(ARCH), zarch) | ifeq ($(ARCH), zarch) | ||||
| USE_TRMM = 1 | USE_TRMM = 1 | ||||
| endif | endif | ||||
| @@ -0,0 +1,184 @@ | |||||
| #SGEMM_BETA = ../generic/gemm_beta.c | |||||
| #DGEMM_BETA = ../generic/gemm_beta.c | |||||
| #CGEMM_BETA = ../generic/zgemm_beta.c | |||||
| #ZGEMM_BETA = ../generic/zgemm_beta.c | |||||
| STRMMKERNEL = strmm_kernel_16x8_power8.S | |||||
| DTRMMKERNEL = dgemm_kernel_power9.S | |||||
| CTRMMKERNEL = ctrmm_kernel_8x4_power8.S | |||||
| ZTRMMKERNEL = ztrmm_kernel_8x2_power8.S | |||||
| SGEMMKERNEL = sgemm_kernel_16x8_power8.S | |||||
| SGEMMINCOPY = ../generic/gemm_ncopy_16.c | |||||
| SGEMMITCOPY = sgemm_tcopy_16_power8.S | |||||
| SGEMMONCOPY = ../generic/gemm_ncopy_8.c | |||||
| SGEMMOTCOPY = sgemm_tcopy_8_power8.S | |||||
| SGEMMINCOPYOBJ = sgemm_incopy.o | |||||
| SGEMMITCOPYOBJ = sgemm_itcopy.o | |||||
| SGEMMONCOPYOBJ = sgemm_oncopy.o | |||||
| SGEMMOTCOPYOBJ = sgemm_otcopy.o | |||||
| DGEMMKERNEL = dgemm_kernel_power9.S | |||||
| DGEMMINCOPY = ../generic/gemm_ncopy_16.c | |||||
| DGEMMITCOPY = dgemm_tcopy_16_power8.S | |||||
| DGEMMONCOPY = dgemm_ncopy_4_power8.S | |||||
| DGEMMOTCOPY = ../generic/gemm_tcopy_4.c | |||||
| DGEMMINCOPYOBJ = dgemm_incopy.o | |||||
| DGEMMITCOPYOBJ = dgemm_itcopy.o | |||||
| DGEMMONCOPYOBJ = dgemm_oncopy.o | |||||
| DGEMMOTCOPYOBJ = dgemm_otcopy.o | |||||
| CGEMMKERNEL = cgemm_kernel_8x4_power8.S | |||||
| CGEMMINCOPY = ../generic/zgemm_ncopy_8.c | |||||
| CGEMMITCOPY = cgemm_tcopy_8_power8.S | |||||
| CGEMMONCOPY = ../generic/zgemm_ncopy_4.c | |||||
| CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c | |||||
| CGEMMONCOPYOBJ = cgemm_oncopy.o | |||||
| CGEMMOTCOPYOBJ = cgemm_otcopy.o | |||||
| CGEMMINCOPYOBJ = cgemm_incopy.o | |||||
| CGEMMITCOPYOBJ = cgemm_itcopy.o | |||||
| ZGEMMKERNEL = zgemm_kernel_8x2_power8.S | |||||
| ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||||
| ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||||
| ZGEMMINCOPY = ../generic/zgemm_ncopy_8.c | |||||
| ZGEMMITCOPY = zgemm_tcopy_8_power8.S | |||||
| ZGEMMONCOPYOBJ = zgemm_oncopy.o | |||||
| ZGEMMOTCOPYOBJ = zgemm_otcopy.o | |||||
| ZGEMMINCOPYOBJ = zgemm_incopy.o | |||||
| ZGEMMITCOPYOBJ = zgemm_itcopy.o | |||||
| STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
| DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| DTRSMKERNEL_LT = dtrsm_kernel_LT_16x4_power8.S | |||||
| DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
| CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
| ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
| #Todo: CGEMM3MKERNEL should be 4x4 blocksizes. | |||||
| #CGEMM3MKERNEL = zgemm3m_kernel_8x4_sse3.S | |||||
| #ZGEMM3MKERNEL = zgemm3m_kernel_4x4_sse3.S | |||||
| #Pure C for other kernels | |||||
| #SAMAXKERNEL = ../arm/amax.c | |||||
| #DAMAXKERNEL = ../arm/amax.c | |||||
| #CAMAXKERNEL = ../arm/zamax.c | |||||
| #ZAMAXKERNEL = ../arm/zamax.c | |||||
| # | |||||
| #SAMINKERNEL = ../arm/amin.c | |||||
| #DAMINKERNEL = ../arm/amin.c | |||||
| #CAMINKERNEL = ../arm/zamin.c | |||||
| #ZAMINKERNEL = ../arm/zamin.c | |||||
| # | |||||
| #SMAXKERNEL = ../arm/max.c | |||||
| #DMAXKERNEL = ../arm/max.c | |||||
| # | |||||
| #SMINKERNEL = ../arm/min.c | |||||
| #DMINKERNEL = ../arm/min.c | |||||
| # | |||||
| ISAMAXKERNEL = isamax.c | |||||
| IDAMAXKERNEL = idamax.c | |||||
| ICAMAXKERNEL = icamax.c | |||||
| IZAMAXKERNEL = izamax.c | |||||
| # | |||||
| ISAMINKERNEL = isamin.c | |||||
| IDAMINKERNEL = idamin.c | |||||
| ICAMINKERNEL = icamin.c | |||||
| IZAMINKERNEL = izamin.c | |||||
| # | |||||
| #ISMAXKERNEL = ../arm/imax.c | |||||
| #IDMAXKERNEL = ../arm/imax.c | |||||
| # | |||||
| #ISMINKERNEL = ../arm/imin.c | |||||
| #IDMINKERNEL = ../arm/imin.c | |||||
| # | |||||
| SASUMKERNEL = sasum.c | |||||
| DASUMKERNEL = dasum.c | |||||
| CASUMKERNEL = casum.c | |||||
| ZASUMKERNEL = zasum.c | |||||
| # | |||||
| SAXPYKERNEL = saxpy.c | |||||
| DAXPYKERNEL = daxpy.c | |||||
| CAXPYKERNEL = caxpy.c | |||||
| ZAXPYKERNEL = zaxpy.c | |||||
| # | |||||
| SCOPYKERNEL = scopy.c | |||||
| DCOPYKERNEL = dcopy.c | |||||
| CCOPYKERNEL = ccopy.c | |||||
| ZCOPYKERNEL = zcopy.c | |||||
| # | |||||
| SDOTKERNEL = sdot.c | |||||
| DDOTKERNEL = ddot.c | |||||
| DSDOTKERNEL = sdot.c | |||||
| CDOTKERNEL = cdot.c | |||||
| ZDOTKERNEL = zdot.c | |||||
| # | |||||
| SNRM2KERNEL = ../arm/nrm2.c | |||||
| DNRM2KERNEL = ../arm/nrm2.c | |||||
| CNRM2KERNEL = ../arm/znrm2.c | |||||
| ZNRM2KERNEL = ../arm/znrm2.c | |||||
| # | |||||
| SROTKERNEL = srot.c | |||||
| DROTKERNEL = drot.c | |||||
| CROTKERNEL = crot.c | |||||
| ZROTKERNEL = zrot.c | |||||
| # | |||||
| SSCALKERNEL = sscal.c | |||||
| DSCALKERNEL = dscal.c | |||||
| CSCALKERNEL = zscal.c | |||||
| ZSCALKERNEL = zscal.c | |||||
| # | |||||
| SSWAPKERNEL = sswap.c | |||||
| DSWAPKERNEL = dswap.c | |||||
| CSWAPKERNEL = cswap.c | |||||
| ZSWAPKERNEL = zswap.c | |||||
| # | |||||
| SGEMVNKERNEL = sgemv_n.c | |||||
| DGEMVNKERNEL = dgemv_n.c | |||||
| CGEMVNKERNEL = cgemv_n.c | |||||
| ZGEMVNKERNEL = zgemv_n_4.c | |||||
| # | |||||
| SGEMVTKERNEL = sgemv_t.c | |||||
| DGEMVTKERNEL = dgemv_t.c | |||||
| CGEMVTKERNEL = cgemv_t.c | |||||
| ZGEMVTKERNEL = zgemv_t_4.c | |||||
| #SSYMV_U_KERNEL = ../generic/symv_k.c | |||||
| #SSYMV_L_KERNEL = ../generic/symv_k.c | |||||
| #DSYMV_U_KERNEL = ../generic/symv_k.c | |||||
| #DSYMV_L_KERNEL = ../generic/symv_k.c | |||||
| #QSYMV_U_KERNEL = ../generic/symv_k.c | |||||
| #QSYMV_L_KERNEL = ../generic/symv_k.c | |||||
| #CSYMV_U_KERNEL = ../generic/zsymv_k.c | |||||
| #CSYMV_L_KERNEL = ../generic/zsymv_k.c | |||||
| #ZSYMV_U_KERNEL = ../generic/zsymv_k.c | |||||
| #ZSYMV_L_KERNEL = ../generic/zsymv_k.c | |||||
| #XSYMV_U_KERNEL = ../generic/zsymv_k.c | |||||
| #XSYMV_L_KERNEL = ../generic/zsymv_k.c | |||||
| #ZHEMV_U_KERNEL = ../generic/zhemv_k.c | |||||
| #ZHEMV_L_KERNEL = ../generic/zhemv_k.c | |||||
| LSAME_KERNEL = ../generic/lsame.c | |||||
| SCABS_KERNEL = ../generic/cabs.c | |||||
| DCABS_KERNEL = ../generic/cabs.c | |||||
| QCABS_KERNEL = ../generic/cabs.c | |||||
| #Dump kernel | |||||
| CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c | |||||
| ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c | |||||
| @@ -46,7 +46,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #endif | #endif | ||||
| #if defined(POWER8) | |||||
| #if defined(POWER8) || defined(POWER9) | |||||
| #include "casum_microk_power8.c" | #include "casum_microk_power8.c" | ||||
| #endif | #endif | ||||
| @@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| #if defined(POWER8) | |||||
| #if defined(POWER8) || defined(POWER9) | |||||
| #include "ccopy_microk_power8.c" | #include "ccopy_microk_power8.c" | ||||
| #endif | #endif | ||||
| @@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| #if defined(POWER8) | |||||
| #if defined(POWER8) || defined(POWER9) | |||||
| static void crot_kernel_8 (long n, float *x, float *y, float c, float s) | static void crot_kernel_8 (long n, float *x, float *y, float c, float s) | ||||
| { | { | ||||
| @@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| #if defined(POWER8) | |||||
| #if defined(POWER8) || defined(POWER9) | |||||
| #include "cswap_microk_power8.c" | #include "cswap_microk_power8.c" | ||||
| #endif | #endif | ||||
| @@ -46,7 +46,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #endif | #endif | ||||
| #if defined(POWER8) | |||||
| #if defined(POWER8) || defined(POWER9) | |||||
| #include "dasum_microk_power8.c" | #include "dasum_microk_power8.c" | ||||
| #endif | #endif | ||||
| @@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| #if defined(POWER8) | |||||
| #if defined(POWER8) || defined(POWER9) | |||||
| #include "daxpy_microk_power8.c" | #include "daxpy_microk_power8.c" | ||||
| #endif | #endif | ||||
| @@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| #if defined(POWER8) | |||||
| #if defined(POWER8) || defined(POWER9) | |||||
| #include "dcopy_microk_power8.c" | #include "dcopy_microk_power8.c" | ||||
| #endif | #endif | ||||
| @@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| #if defined(POWER8) | |||||
| #if defined(POWER8) || defined(POWER9) | |||||
| #include "ddot_microk_power8.c" | #include "ddot_microk_power8.c" | ||||
| #endif | #endif | ||||
| @@ -0,0 +1,249 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013-2019, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| /* DGEMM/DTRMM micro-kernel entry point for POWER9. */ | |||||
| /* Saves the callee-saved registers it uses, sets up constants, runs the loop nest */ | |||||
| /* included from dgemm_logic_power9.S, then restores everything and returns 0 in r3. */ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| #include "def_vsx.h" | |||||
| #define LOAD ld | |||||
| /* Stack frame layout (byte offsets from SP after the frame is allocated): */ | |||||
| /*   0..143   f14-f31  (18 x 8 bytes) */ | |||||
| /*   144..287 r31-r14  (18 x 8 bytes) */ | |||||
| /*   288..479 vs52-vs63, i.e. VR20-VR31 (12 x 16 bytes) */ | |||||
| /*   488      alpha (double), 496 32-bit zero word */ | |||||
| #define STACKSIZE (512 ) | |||||
| #define ALPHA_SP (296+192)(SP) | |||||
| #define FZERO (304+192)(SP) | |||||
| /* Register aliases for the kernel arguments (alpha arrives in f1, see the stfd below). */ | |||||
| #define M r3 | |||||
| #define N r4 | |||||
| #define K r5 | |||||
| #define A r7 | |||||
| #define B r8 | |||||
| #define C r9 | |||||
| #define LDC r10 | |||||
| #define OFFSET r6 | |||||
| /* alpha broadcast to both doublewords of a VSX register (filled by lxvdsx below). */ | |||||
| #define alpha_r vs18 | |||||
| /* o0 is the literal 0; o8..o48 are registers loaded with the matching constants below. */ | |||||
| #define o0 0 | |||||
| #define T4 r12 | |||||
| #define T3 r11 | |||||
| #define C4 r14 | |||||
| #define o8 r15 | |||||
| #define o24 r16 | |||||
| #define C2 r17 | |||||
| #define L r18 | |||||
| #define T1 r19 | |||||
| #define C3 r20 | |||||
| #define TEMP_REG r21 | |||||
| #define I r22 | |||||
| #define J r23 | |||||
| #define AO r24 | |||||
| #define BO r25 | |||||
| #define CO r26 | |||||
| #define o16 r27 | |||||
| #define o32 r28 | |||||
| #define o48 r29 | |||||
| #define PRE r30 | |||||
| #define T2 r31 | |||||
| #include "dgemm_macros_power9.S" | |||||
| #ifndef NEEDPARAM | |||||
| PROLOGUE | |||||
| PROFCODE | |||||
| /* Allocate the frame; r0 = 0 is stored into the FZERO slot further down. */ | |||||
| addi SP, SP, -STACKSIZE | |||||
| li r0, 0 | |||||
| /* Save floating-point registers f14-f31. */ | |||||
| stfd f14, 0(SP) | |||||
| stfd f15, 8(SP) | |||||
| stfd f16, 16(SP) | |||||
| stfd f17, 24(SP) | |||||
| stfd f18, 32(SP) | |||||
| stfd f19, 40(SP) | |||||
| stfd f20, 48(SP) | |||||
| stfd f21, 56(SP) | |||||
| stfd f22, 64(SP) | |||||
| stfd f23, 72(SP) | |||||
| stfd f24, 80(SP) | |||||
| stfd f25, 88(SP) | |||||
| stfd f26, 96(SP) | |||||
| stfd f27, 104(SP) | |||||
| stfd f28, 112(SP) | |||||
| stfd f29, 120(SP) | |||||
| stfd f30, 128(SP) | |||||
| stfd f31, 136(SP) | |||||
| /* Save general-purpose registers r14-r31. */ | |||||
| std r31, 144(SP) | |||||
| std r30, 152(SP) | |||||
| std r29, 160(SP) | |||||
| std r28, 168(SP) | |||||
| std r27, 176(SP) | |||||
| std r26, 184(SP) | |||||
| std r25, 192(SP) | |||||
| std r24, 200(SP) | |||||
| std r23, 208(SP) | |||||
| std r22, 216(SP) | |||||
| std r21, 224(SP) | |||||
| std r20, 232(SP) | |||||
| std r19, 240(SP) | |||||
| std r18, 248(SP) | |||||
| std r17, 256(SP) | |||||
| std r16, 264(SP) | |||||
| std r15, 272(SP) | |||||
| std r14, 280(SP) | |||||
| /* Save vector registers VR20-VR31. stxv takes a VSX register number, and VRn is */ | |||||
| /* VSR(32+n), so they must be named vs52-vs63; writing v20-v31 here would select */ | |||||
| /* VSR20-VSR31 (the f20-f31 registers) and leave VR20-VR31 unsaved. */ | |||||
| /* Assumes def_vsx.h provides vs52-vs63 like the vs18 used for alpha_r - TODO confirm. */ | |||||
| stxv vs52, 288(SP) | |||||
| stxv vs53, 304(SP) | |||||
| stxv vs54, 320(SP) | |||||
| stxv vs55, 336(SP) | |||||
| stxv vs56, 352(SP) | |||||
| stxv vs57, 368(SP) | |||||
| stxv vs58, 384(SP) | |||||
| stxv vs59, 400(SP) | |||||
| stxv vs60, 416(SP) | |||||
| stxv vs61, 432(SP) | |||||
| stxv vs62, 448(SP) | |||||
| stxv vs63, 464(SP) | |||||
| /* Spill alpha (f1) and a 32-bit zero word to their stack slots. */ | |||||
| stfd f1, ALPHA_SP | |||||
| stw r0, FZERO | |||||
| /* Scale LDC by 2^BASE_SHIFT (BASE_SHIFT comes from the included headers). */ | |||||
| slwi LDC, LDC, BASE_SHIFT | |||||
| #if defined(TRMMKERNEL) | |||||
| /* TRMM builds fetch OFFSET from the caller's frame (FRAMESLOT is defined outside this file). */ | |||||
| ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) | |||||
| #endif | |||||
| /* Nothing to do when M, N or K is <= 0. The .L999_H1 target is not defined in this */ | |||||
| /* file; presumably it comes from dgemm_logic_power9.S - confirm. */ | |||||
| cmpwi cr0, M, 0 | |||||
| ble .L999_H1 | |||||
| cmpwi cr0, N, 0 | |||||
| ble .L999_H1 | |||||
| cmpwi cr0, K, 0 | |||||
| ble .L999_H1 | |||||
| /* T1 = address of the alpha slot (same offset as ALPHA_SP). */ | |||||
| addi T1, SP, 296+192 | |||||
| /* Constants used by the included macros/logic; PRE is presumably a prefetch distance - confirm. */ | |||||
| li PRE, 384 | |||||
| li o8 , 8 | |||||
| li o16, 16 | |||||
| li o24, 24 | |||||
| li o32, 32 | |||||
| li o48, 48 | |||||
| /* Load alpha from the stack and splat it into both doublewords of alpha_r. */ | |||||
| lxvdsx alpha_r, 0, T1 | |||||
| #include "dgemm_logic_power9.S" | |||||
| .L999: | |||||
| /* Return value 0 in r3, then restore everything saved in the prologue. */ | |||||
| addi r3, 0, 0 | |||||
| lfd f14, 0(SP) | |||||
| lfd f15, 8(SP) | |||||
| lfd f16, 16(SP) | |||||
| lfd f17, 24(SP) | |||||
| lfd f18, 32(SP) | |||||
| lfd f19, 40(SP) | |||||
| lfd f20, 48(SP) | |||||
| lfd f21, 56(SP) | |||||
| lfd f22, 64(SP) | |||||
| lfd f23, 72(SP) | |||||
| lfd f24, 80(SP) | |||||
| lfd f25, 88(SP) | |||||
| lfd f26, 96(SP) | |||||
| lfd f27, 104(SP) | |||||
| lfd f28, 112(SP) | |||||
| lfd f29, 120(SP) | |||||
| lfd f30, 128(SP) | |||||
| lfd f31, 136(SP) | |||||
| ld r31, 144(SP) | |||||
| ld r30, 152(SP) | |||||
| ld r29, 160(SP) | |||||
| ld r28, 168(SP) | |||||
| ld r27, 176(SP) | |||||
| ld r26, 184(SP) | |||||
| ld r25, 192(SP) | |||||
| ld r24, 200(SP) | |||||
| ld r23, 208(SP) | |||||
| ld r22, 216(SP) | |||||
| ld r21, 224(SP) | |||||
| ld r20, 232(SP) | |||||
| ld r19, 240(SP) | |||||
| ld r18, 248(SP) | |||||
| ld r17, 256(SP) | |||||
| ld r16, 264(SP) | |||||
| ld r15, 272(SP) | |||||
| ld r14, 280(SP) | |||||
| /* Restore VR20-VR31 through their VSX names (mirror of the prologue stores). */ | |||||
| lxv vs52, 288(SP) | |||||
| lxv vs53, 304(SP) | |||||
| lxv vs54, 320(SP) | |||||
| lxv vs55, 336(SP) | |||||
| lxv vs56, 352(SP) | |||||
| lxv vs57, 368(SP) | |||||
| lxv vs58, 384(SP) | |||||
| lxv vs59, 400(SP) | |||||
| lxv vs60, 416(SP) | |||||
| lxv vs61, 432(SP) | |||||
| lxv vs62, 448(SP) | |||||
| lxv vs63, 464(SP) | |||||
| /* Release the frame and return. */ | |||||
| addi SP, SP, STACKSIZE | |||||
| blr | |||||
| EPILOGUE | |||||
| #endif | |||||
| @@ -38,7 +38,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| #if defined(POWER8) | |||||
| #if defined(POWER8) || defined(POWER9) | |||||
| #include "dgemv_n_microk_power8.c" | #include "dgemv_n_microk_power8.c" | ||||
| #endif | #endif | ||||
| @@ -39,7 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #pragma GCC optimize "O1" | #pragma GCC optimize "O1" | ||||
| #if defined(POWER8) | |||||
| #if defined(POWER8) || defined(POWER9) | |||||
| #include "drot_microk_power8.c" | #include "drot_microk_power8.c" | ||||
| #endif | #endif | ||||
| @@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| #if defined(POWER8) | |||||
| #if defined(POWER8) || defined(POWER9) | |||||
| #include "dscal_microk_power8.c" | #include "dscal_microk_power8.c" | ||||
| #endif | #endif | ||||
| @@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| #if defined(POWER8) | |||||
| #if defined(POWER8) || defined(POWER9) | |||||
| #include "dswap_microk_power8.c" | #include "dswap_microk_power8.c" | ||||
| #endif | #endif | ||||
| @@ -46,7 +46,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #endif | #endif | ||||
| #if defined(POWER8) | |||||
| #if defined(POWER8) || defined(POWER9) | |||||
| #include "sasum_microk_power8.c" | #include "sasum_microk_power8.c" | ||||
| #endif | #endif | ||||
| @@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| #if defined(POWER8) | |||||
| #if defined(POWER8) || defined(POWER9) | |||||
| #include "scopy_microk_power8.c" | #include "scopy_microk_power8.c" | ||||
| #endif | #endif | ||||
| @@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| #if defined(POWER8) | |||||
| #if defined(POWER8) || defined(POWER9) | |||||
| #include "sdot_microk_power8.c" | #include "sdot_microk_power8.c" | ||||
| #endif | #endif | ||||
| @@ -39,7 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #pragma GCC optimize "O1" | #pragma GCC optimize "O1" | ||||
| #if defined(POWER8) | |||||
| #if defined(POWER8) || defined(POWER9) | |||||
| #include "srot_microk_power8.c" | #include "srot_microk_power8.c" | ||||
| #endif | #endif | ||||
| @@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| #if defined(POWER8) | |||||
| #if defined(POWER8) || defined(POWER9) | |||||
| #include "sscal_microk_power8.c" | #include "sscal_microk_power8.c" | ||||
| #endif | #endif | ||||
| @@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| #if defined(POWER8) | |||||
| #if defined(POWER8) || defined(POWER9) | |||||
| #include "sswap_microk_power8.c" | #include "sswap_microk_power8.c" | ||||
| #endif | #endif | ||||
| @@ -46,7 +46,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #endif | #endif | ||||
| #if defined(POWER8) | |||||
| #if defined(POWER8) || defined(POWER9) | |||||
| #include "zasum_microk_power8.c" | #include "zasum_microk_power8.c" | ||||
| #endif | #endif | ||||
| @@ -36,19 +36,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| #if defined(POWER8) | |||||
| #if defined(POWER8) || defined(POWER9) | |||||
| #include "zaxpy_microk_power8.c" | #include "zaxpy_microk_power8.c" | ||||
| #endif | #endif | ||||
| #ifndef HAVE_KERNEL_4 | #ifndef HAVE_KERNEL_4 | ||||
| static void zaxpy_kernel_4(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||||
| static void zaxpy_kernel_4(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT da_r,FLOAT da_i) | |||||
| { | { | ||||
| BLASLONG register i = 0; | BLASLONG register i = 0; | ||||
| BLASLONG register ix = 0; | BLASLONG register ix = 0; | ||||
| FLOAT da_r = alpha[0]; | |||||
| FLOAT da_i = alpha[1]; | |||||
| while(i < n) | while(i < n) | ||||
| @@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| #if defined(POWER8) | |||||
| #if defined(POWER8) || defined(POWER9) | |||||
| #include "zcopy_microk_power8.c" | #include "zcopy_microk_power8.c" | ||||
| #endif | #endif | ||||
| @@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| #if defined(POWER8) | |||||
| #if defined(POWER8) || defined(POWER9) | |||||
| #include "zdot_microk_power8.c" | #include "zdot_microk_power8.c" | ||||
| #endif | #endif | ||||
| @@ -38,7 +38,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #pragma GCC optimize "O1" | #pragma GCC optimize "O1" | ||||
| #if defined(POWER8) | |||||
| #if defined(POWER8) || defined(POWER9) | |||||
| #if defined(DOUBLE) | #if defined(DOUBLE) | ||||
| #include "zscal_microk_power8.c" | #include "zscal_microk_power8.c" | ||||
| #endif | #endif | ||||
| @@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| #if defined(POWER8) | |||||
| #if defined(POWER8) || defined(POWER9) | |||||
| #include "zswap_microk_power8.c" | #include "zswap_microk_power8.c" | ||||
| #endif | #endif | ||||
| @@ -2230,6 +2230,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #endif | #endif | ||||
| #if defined(POWER9) /* GEMM tuning parameters selected when POWER9 is defined */ | |||||
| #define SNUMOPT 16 | |||||
| #define DNUMOPT 8 | |||||
| #define GEMM_DEFAULT_OFFSET_A 0 | |||||
| #define GEMM_DEFAULT_OFFSET_B 65536 /* 65536 = 64 KiB */ | |||||
| #define GEMM_DEFAULT_ALIGN 0x0ffffUL /* 0xffff = 64 KiB - 1; presumably an alignment mask - confirm against users of GEMM_DEFAULT_ALIGN */ | |||||
| #define SGEMM_DEFAULT_UNROLL_M 16 /* per-precision UNROLL_M / UNROLL_N pairs follow: S 16x8, D 16x4, C 8x4, Z 8x2 */ | |||||
| #define SGEMM_DEFAULT_UNROLL_N 8 | |||||
| #define DGEMM_DEFAULT_UNROLL_M 16 | |||||
| #define DGEMM_DEFAULT_UNROLL_N 4 | |||||
| #define CGEMM_DEFAULT_UNROLL_M 8 | |||||
| #define CGEMM_DEFAULT_UNROLL_N 4 | |||||
| #define ZGEMM_DEFAULT_UNROLL_M 8 | |||||
| #define ZGEMM_DEFAULT_UNROLL_N 2 | |||||
| #define SGEMM_DEFAULT_P 1280 /* per-precision P values; NOTE(review): presumably blocking sizes - confirm against the GEMM driver */ | |||||
| #define DGEMM_DEFAULT_P 128 | |||||
| #define CGEMM_DEFAULT_P 640 | |||||
| #define ZGEMM_DEFAULT_P 320 | |||||
| #define SGEMM_DEFAULT_Q 640 /* per-precision Q values; same caveat as the P values above */ | |||||
| #define DGEMM_DEFAULT_Q 384 | |||||
| #define CGEMM_DEFAULT_Q 640 | |||||
| #define ZGEMM_DEFAULT_Q 640 | |||||
| #define SYMV_P 8 | |||||
| #endif /* POWER9 */ | |||||
| #if defined(SPARC) && defined(V7) | #if defined(SPARC) && defined(V7) | ||||