| @@ -190,4 +190,7 @@ In chronological order: | |||
| * [2020-09-07] Fix builds with clang on IBM z, including dynamic architecture support | |||
| * Danfeng Zhang <https://github.com/craft-zhang> | |||
| * [2020-05-20] Improve performance of SGEMM and STRMM on Arm Cortex-A53 | |||
| * [2020-05-20] Improve performance of SGEMM and STRMM on Arm Cortex-A53 | |||
| * PingTouGe Semiconductor Co., Ltd. | |||
| * [2020-10] Add RISC-V Vector (0.7.1) support. Optimize BLAS kernels for Xuantie C910 | |||
| @@ -3,21 +3,29 @@ RANLIB = ranlib | |||
| ifdef BINARY64 | |||
| ifeq ($(C_COMPILER), GCC) | |||
| CCOMMON_OPT += -mcpu=v9 -m64 | |||
| else | |||
| CCOMMON_OPT += -m64 | |||
| endif | |||
| ifeq ($(COMPILER_F77), g77) | |||
| FCOMMON_OPT += -mcpu=v9 -m64 | |||
| endif | |||
| ifeq ($(COMPILER_F77), f90) | |||
| FCOMMON_OPT += -xarch=v9 | |||
| ifeq ($(COMPILER_F77), f95) | |||
| FCOMMON_OPT += -m64 | |||
| endif | |||
| else | |||
| ifeq ($(C_COMPILER), GCC) | |||
| CCOMMON_OPT += -mcpu=v9 | |||
| else | |||
| CCOMMON_OPT += -xarch=v9 | |||
| endif | |||
| ifeq ($(COMPILER_F77), g77) | |||
| FCOMMON_OPT += -mcpu=v9 | |||
| endif | |||
| ifeq ($(COMPILER_F77), f90) | |||
| ifeq ($(COMPILER_F77), f95) | |||
| FCOMMON_OPT += -xarch=v8plusb | |||
| endif | |||
| @@ -37,4 +45,4 @@ LIBSUNPERF = -L/opt/SUNWspro/lib/v9 -L/opt/SUNWspro/prod/lib/v9 \ | |||
| else | |||
| LIBSUNPERF = -L/opt/SUNWspro/lib -L/opt/SUNWspro/prod/lib \ | |||
| -Wl,-R,/opt/SUNWspro/lib -lsunperf -lompstubs -lfui -lfsu -lsunmath | |||
| endif | |||
| endif | |||
| @@ -1131,16 +1131,25 @@ CCOMMON_OPT += -w | |||
| ifeq ($(ARCH), x86) | |||
| CCOMMON_OPT += -m32 | |||
| else | |||
| FCOMMON_OPT += -m64 | |||
| ifdef BINARY64 | |||
| CCOMMON_OPT += -m64 | |||
| else | |||
| CCOMMON_OPT += -m32 | |||
| endif | |||
| endif | |||
| endif | |||
| ifeq ($(F_COMPILER), SUN) | |||
| CCOMMON_OPT += -DF_INTERFACE_SUN | |||
| FCOMMON_OPT += -ftrap=%none -xrecursive | |||
| ifeq ($(ARCH), x86) | |||
| FCOMMON_OPT += -m32 | |||
| else | |||
| ifdef BINARY64 | |||
| FCOMMON_OPT += -m64 | |||
| else | |||
| FCOMMON_OPT += -m32 | |||
| endif | |||
| endif | |||
| ifeq ($(USE_OPENMP), 1) | |||
| FCOMMON_OPT += -xopenmp=parallel | |||
| @@ -1313,8 +1322,10 @@ KERNELDIR = $(TOPDIR)/kernel/$(ARCH) | |||
| include $(TOPDIR)/Makefile.$(ARCH) | |||
| ifneq ($(C_COMPILER), PGI) | |||
| ifneq ($(C_COMPILER), SUN) | |||
| CCOMMON_OPT += -UASMNAME -UASMFNAME -UNAME -UCNAME -UCHAR_NAME -UCHAR_CNAME | |||
| endif | |||
| endif | |||
| CCOMMON_OPT += -DASMNAME=$(FU)$(*F) -DASMFNAME=$(FU)$(*F)$(BU) -DNAME=$(*F)$(BU) -DCNAME=$(*F) -DCHAR_NAME=\"$(*F)$(BU)\" -DCHAR_CNAME=\"$(*F)\" | |||
| ifeq ($(CORE), PPC440) | |||
| @@ -6,7 +6,8 @@ | |||
| # Checking cross compile | |||
| $hostos = `uname -s | sed -e s/\-.*//`; chop($hostos); | |||
| $hostarch = `uname -m | sed -e s/i.86/x86/`;chop($hostarch); | |||
| $hostarch = `uname -p` if ($hostos eq "AIX"); | |||
| $hostarch = `uname -p` if ($hostos eq "AIX" || $hostos eq "SunOS"); | |||
| chop($hostarch); | |||
| $hostarch = "x86_64" if ($hostarch eq "amd64"); | |||
| $hostarch = "arm" if ($hostarch ne "arm64" && $hostarch =~ /^arm.*/); | |||
| $hostarch = "arm64" if ($hostarch eq "aarch64"); | |||
| @@ -78,6 +78,12 @@ static __inline unsigned long rpcc(void){ | |||
| #define __BIG_ENDIAN__ | |||
| #endif | |||
| #ifdef C_SUN | |||
| #ifndef __64BIT | |||
| #define RETURN_BY_STACK | |||
| #endif | |||
| #endif | |||
| #ifdef DOUBLE | |||
| #define GET_IMAGE(res) __asm__ __volatile__("fmovd %%f2, %0" : "=f"(res) : : "memory") | |||
| #else | |||
| @@ -48,7 +48,7 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA | |||
| dot[0]=0.0; | |||
| dot[1]=0.0; | |||
| #if !defined(__PPC__) | |||
| #if !defined(__PPC__) && !defined(__SunOS) | |||
| CREAL(result) = 0.0 ; | |||
| CIMAG(result) = 0.0 ; | |||
| #else | |||
| @@ -73,7 +73,7 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA | |||
| i++ ; | |||
| } | |||
| #if !defined(__PPC__) | |||
| #if !defined(__PPC__) && !defined(__SunOS) | |||
| CREAL(result) = dot[0]; | |||
| CIMAG(result) = dot[1]; | |||
| #else | |||
| @@ -758,10 +758,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| pc0[1] += alphar * res1; \ | |||
| pc0[1] += alphai * res0; \ | |||
| \ | |||
| pc1[2] += alphar * res2; \ | |||
| pc1[2] -= alphai * res3; \ | |||
| pc1[3] += alphar * res3; \ | |||
| pc1[3] += alphai * res2; \ | |||
| pc1[0] += alphar * res2; \ | |||
| pc1[0] -= alphai * res3; \ | |||
| pc1[1] += alphar * res3; \ | |||
| pc1[1] += alphai * res2; \ | |||
| } | |||
| #define CGEMM_SCALE_1X1 \ | |||
| @@ -1067,10 +1067,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| pc0[1] = alphar * res1; \ | |||
| pc0[1] += alphai * res0; \ | |||
| \ | |||
| pc1[2] = alphar * res2; \ | |||
| pc1[2] -= alphai * res3; \ | |||
| pc1[3] = alphar * res3; \ | |||
| pc1[3] += alphai * res2; \ | |||
| pc1[0] = alphar * res2; \ | |||
| pc1[0] -= alphai * res3; \ | |||
| pc1[1] = alphar * res3; \ | |||
| pc1[1] += alphai * res2; \ | |||
| } | |||
| #define CGEMM_TRMM_SCALE_1X1 \ | |||
| @@ -56,11 +56,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #if !defined(XCONJ) | |||
| #define OP0 += | |||
| #define OP1 -= | |||
| #define OP2 -= | |||
| #define OP2 += | |||
| #else | |||
| #define OP0 -= | |||
| #define OP1 -= | |||
| #define OP2 += | |||
| #define OP2 -= | |||
| #endif | |||
| #endif | |||
| @@ -32,14 +32,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #undef OP1 | |||
| #undef OP2 | |||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||
| #define OP0 -= | |||
| #define OP1 += | |||
| #define OP2 += | |||
| #if !defined(CONJ) | |||
| #if !defined(XCONJ) | |||
| #define OP0 -= | |||
| #define OP1 += | |||
| #define OP2 += | |||
| #else | |||
| #define OP0 += | |||
| #define OP1 += | |||
| #define OP2 -= | |||
| #endif | |||
| #else | |||
| #define OP0 += | |||
| #define OP1 += | |||
| #define OP2 -= | |||
| #if !defined(XCONJ) | |||
| #define OP0 += | |||
| #define OP1 -= | |||
| #define OP2 += | |||
| #else | |||
| #define OP0 -= | |||
| #define OP1 -= | |||
| #define OP2 -= | |||
| #endif | |||
| #endif | |||
| #define CGEMV_T_8x4() \ | |||
| @@ -184,7 +184,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, | |||
| } | |||
| } | |||
| } | |||
| else | |||
| else if ((inc_x != 0) && (inc_y != 0)) | |||
| { | |||
| for (i = (n >> 3); i--;) | |||
| { | |||
| @@ -248,6 +248,32 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, | |||
| } | |||
| } | |||
| } | |||
| else | |||
| { | |||
| if (inc_x == inc_y) | |||
| { | |||
| if (n & 1) | |||
| { | |||
| x0 = *srcx; | |||
| *srcx = *srcy; | |||
| *srcy = x0; | |||
| } | |||
| else | |||
| return (0); | |||
| } | |||
| else | |||
| { | |||
| BLASLONG ix = 0, iy = 0; | |||
| while (i < n) | |||
| { | |||
| x0 = srcx[ix]; | |||
| srcx[ix] = srcy[iy]; | |||
| srcy[iy] = x0; | |||
| ix += inc_x; | |||
| iy += inc_y; | |||
| i++; | |||
| } | |||
| } | |||
| } | |||
| return (0); | |||
| } | |||
| @@ -198,7 +198,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, | |||
| } | |||
| } | |||
| } | |||
| else | |||
| else if ((inc_x != 0) && (inc_y != 0)) | |||
| { | |||
| for (i = (n >> 3); i--;) | |||
| { | |||
| @@ -262,6 +262,33 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, | |||
| } | |||
| } | |||
| } | |||
| else | |||
| { | |||
| if (inc_x == inc_y) | |||
| { | |||
| if (n & 1) | |||
| { | |||
| x0 = *srcx; | |||
| *srcx = *srcy; | |||
| *srcy = x0; | |||
| } | |||
| else | |||
| return (0); | |||
| } | |||
| else | |||
| { | |||
| BLASLONG ix = 0, iy = 0; | |||
| while (i < n) | |||
| { | |||
| x0 = srcx[ix]; | |||
| srcx[ix] = srcy[iy]; | |||
| srcy[iy] = x0; | |||
| ix += inc_x; | |||
| iy += inc_y; | |||
| i++; | |||
| } | |||
| } | |||
| } | |||
| return (0); | |||
| } | |||
| @@ -56,11 +56,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #if !defined(XCONJ) | |||
| #define OP0 += | |||
| #define OP1 -= | |||
| #define OP2 -= | |||
| #define OP2 += | |||
| #else | |||
| #define OP0 -= | |||
| #define OP1 -= | |||
| #define OP2 += | |||
| #define OP2 -= | |||
| #endif | |||
| #endif | |||
| @@ -34,14 +34,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #undef OP3 | |||
| #undef OP4 | |||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||
| #define OP0 -= | |||
| #define OP1 += | |||
| #define OP2 += | |||
| #if !defined(CONJ) | |||
| #if !defined(XCONJ) | |||
| #define OP0 -= | |||
| #define OP1 += | |||
| #define OP2 += | |||
| #else | |||
| #define OP0 += | |||
| #define OP1 += | |||
| #define OP2 -= | |||
| #endif | |||
| #else | |||
| #define OP0 += | |||
| #define OP1 += | |||
| #define OP2 -= | |||
| #if !defined(XCONJ) | |||
| #define OP0 += | |||
| #define OP1 -= | |||
| #define OP2 += | |||
| #else | |||
| #define OP0 -= | |||
| #define OP1 -= | |||
| #define OP2 -= | |||
| #endif | |||
| #endif | |||
| #define ZGEMV_T_8x1() \ | |||
| @@ -54,3 +54,13 @@ ZTRSMKERNEL_LN = ztrsm_kernel_LN.S | |||
| ZTRSMKERNEL_LT = ztrsm_kernel_LT.S | |||
| ZTRSMKERNEL_RN = ztrsm_kernel_LT.S | |||
| ZTRSMKERNEL_RT = ztrsm_kernel_RT.S | |||
| SDOTKERNEL = ../generic/dot.c | |||
| SDSDOTKERNEL = ../generic/dot.c | |||
| DSDOTKERNEL = ../generic/dot.c | |||
| DDOTKERNEL = ../generic/dot.c | |||
| CDOTKERNEL = ../arm/zdot.c | |||
| ZDOTKERNEL = ../arm/zdot.c | |||
| CSWAPKERNEL = ../arm/zswap.c | |||
| ZSWAPKERNEL = ../arm/zswap.c | |||
| @@ -1454,22 +1454,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define SGEMM_DEFAULT_P 768 | |||
| #define SGEMM_DEFAULT_R sgemm_r | |||
| //#define SGEMM_DEFAULT_R 1024 | |||
| /*#define SGEMM_DEFAULT_R 1024*/ | |||
| #define DGEMM_DEFAULT_P 512 | |||
| #define DGEMM_DEFAULT_R dgemm_r | |||
| //#define DGEMM_DEFAULT_R 1024 | |||
| /*#define DGEMM_DEFAULT_R 1024*/ | |||
| #define QGEMM_DEFAULT_P 504 | |||
| #define QGEMM_DEFAULT_R qgemm_r | |||
| #define CGEMM_DEFAULT_P 768 | |||
| #define CGEMM_DEFAULT_R cgemm_r | |||
| //#define CGEMM_DEFAULT_R 1024 | |||
| /*#define CGEMM_DEFAULT_R 1024*/ | |||
| #define ZGEMM_DEFAULT_P 512 | |||
| #define ZGEMM_DEFAULT_R zgemm_r | |||
| //#define ZGEMM_DEFAULT_R 1024 | |||
| /*#define ZGEMM_DEFAULT_R 1024*/ | |||
| #define XGEMM_DEFAULT_P 252 | |||
| #define XGEMM_DEFAULT_R xgemm_r | |||
| @@ -2571,7 +2571,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| #ifdef LOONGSON3A | |||
| ////Copy from SICORTEX | |||
| /*Copy from SICORTEX*/ | |||
| #define SNUMOPT 2 | |||
| #define DNUMOPT 2 | |||
| @@ -2863,7 +2863,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define SYMV_P 16 | |||
| #endif | |||
| // Common ARMv8 parameters | |||
| /* Common ARMv8 parameters */ | |||
| #if defined(ARMV8) | |||
| #define SNUMOPT 2 | |||
| @@ -3066,7 +3066,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d | |||
| #define CGEMM_DEFAULT_R 4096 | |||
| #define ZGEMM_DEFAULT_R 4096 | |||
| #else // Other/undetected ARMv8 cores | |||
| #else /* Other/undetected ARMv8 cores */ | |||
| #define SGEMM_DEFAULT_UNROLL_M 16 | |||
| #define SGEMM_DEFAULT_UNROLL_N 4 | |||
| @@ -3095,9 +3095,9 @@ is a big desktop or server with abundant cache rather than a phone or embedded d | |||
| #define CGEMM_DEFAULT_R 4096 | |||
| #define ZGEMM_DEFAULT_R 4096 | |||
| #endif // Cores | |||
| #endif /* Cores */ | |||
| #endif // ARMv8 | |||
| #endif /* ARMv8 */ | |||
| #if defined(ARMV5) | |||
| #define SNUMOPT 2 | |||
| @@ -35,6 +35,9 @@ endif | |||
| ifeq ($(C_COMPILER), PGI) | |||
| OBJS = utest_main2.o | |||
| endif | |||
| ifeq ($(C_COMPILER), SUN) | |||
| OBJS = utest_main2.o | |||
| endif | |||
| ifeq ($(OSNAME), AIX) | |||
| OBJS = utest_main2.o | |||
| endif | |||