Browse Source

Merge branch 'develop' into aix

tags/v0.3.11^2
Martin Kroeker GitHub 5 years ago
parent
commit
ed7e155c35
No known key found for this signature in database GPG Key ID: 4AEE18F83AFDEB23
16 changed files with 1321 additions and 200 deletions
  1. +9
    -2
      Makefile.system
  2. +10
    -0
      Makefile.x86_64
  3. +24
    -0
      cpuid_x86.c
  4. +12
    -0
      driver/others/dynamic.c
  5. +119
    -119
      kernel/Makefile.L3
  6. +16
    -16
      kernel/generic/gemm_ncopy_16.c
  7. +22
    -22
      kernel/generic/gemm_ncopy_8.c
  8. +13
    -13
      kernel/generic/gemm_tcopy_16.c
  9. +23
    -23
      kernel/generic/gemm_tcopy_8.c
  10. +11
    -0
      kernel/power/KERNEL.POWER10
  11. +2
    -2
      kernel/power/cgemm_kernel_8x4_power8.S
  12. +1044
    -0
      kernel/power/shgemm_kernel_power10.c
  13. +1
    -1
      kernel/x86_64/sgemm_kernel_8x4_haswell_2.c
  14. +1
    -1
      kernel/x86_64/strsm_kernel_8x4_haswell_LN.c
  15. +1
    -1
      kernel/x86_64/strsm_kernel_8x4_haswell_L_common.h
  16. +13
    -0
      param.h

+ 9
- 2
Makefile.system View File

@@ -286,8 +286,15 @@ GCCVERSIONEQ5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` = 5)
GCCVERSIONGT5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 5)
GCCVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 7)
GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9)
GCCMINORVERSIONGTEQ2 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 2)
GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7)
# Note that the behavior of -dumpversion is compile-time-configurable for
# gcc-7.x and newer. Use -dumpfullversion there
ifeq ($(GCCVERSIONGTEQ7),1)
GCCDUMPVERSION_PARAM := -dumpfullversion
else
GCCDUMPVERSION_PARAM := -dumpversion
endif
GCCMINORVERSIONGTEQ2 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 2)
GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 7)
endif

#


+ 10
- 0
Makefile.x86_64 View File

@@ -31,14 +31,24 @@ ifeq ($(CORE), HASWELL)
ifndef DYNAMIC_ARCH
ifndef NO_AVX2
# For a fixed (non-DYNAMIC_ARCH) HASWELL build, add -mavx2 to the C and
# Fortran flags when the respective GNU compiler is recent enough.
ifeq ($(C_COMPILER), GCC)
# AVX2 support was added in 4.7.0, i.e. it is available with any major
# version above 4, or with 4.x when x >= 7. Requiring "major >= 4 and
# minor >= 7" alone would wrongly reject releases such as 5.4 or 9.3.
GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4)
GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7)
HASWELL_CCMAJORGT4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 4)
# Digits are: <major > 4><major >= 4><minor >= 7>.
# 011 = 4.7 or later 4.x, 110/111 = any release with major > 4.
# An empty result (compiler probe failed) matches nothing, so no flag is added.
HASWELL_CCAVX2CHECK := $(HASWELL_CCMAJORGT4)$(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7)
ifneq ($(filter 011 110 111, $(HASWELL_CCAVX2CHECK)),)
CCOMMON_OPT += -mavx2
endif
endif
ifeq ($(F_COMPILER), GFORTRAN)
# AVX2 support was added in 4.7.0; same version test as above, but probing $(FC).
# NOTE(review): as in the original, this overwrites GCCVERSIONGTEQ4 and
# GCCMINORVERSIONGTEQ7 with values describing $(FC) rather than $(CC) -
# confirm that no later makefile relies on the $(CC) values.
GCCVERSIONGTEQ4 := $(shell expr `$(FC) -dumpversion | cut -f1 -d.` \>= 4)
GCCMINORVERSIONGTEQ7 := $(shell expr `$(FC) -dumpversion | cut -f2 -d.` \>= 7)
HASWELL_FCMAJORGT4 := $(shell expr `$(FC) -dumpversion | cut -f1 -d.` \> 4)
HASWELL_FCAVX2CHECK := $(HASWELL_FCMAJORGT4)$(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7)
ifneq ($(filter 011 110 111, $(HASWELL_FCAVX2CHECK)),)
FCOMMON_OPT += -mavx2
endif
endif
endif
endif
endif





+ 24
- 0
cpuid_x86.c View File

@@ -1406,6 +1406,17 @@ int get_cpuname(void){
return CPUTYPE_SANDYBRIDGE;
else
return CPUTYPE_NEHALEM;
}
/* Comet Lake parts: pick the best CPU type that the available AVX level allows. */
case 10: //family 6 exmodel 10
switch (model) {
case 5: // Comet Lake H and S
case 6: // Comet Lake U
/* Preference order: AVX2 -> HASWELL, AVX only -> SANDYBRIDGE, neither -> NEHALEM. */
if(support_avx2())
return CPUTYPE_HASWELL;
if(support_avx())
return CPUTYPE_SANDYBRIDGE;
else
return CPUTYPE_NEHALEM;
}
break;
}
@@ -1955,6 +1966,19 @@ int get_coretype(void){
return CORE_NEHALEM;
}
break;
case 10:
switch (model) {
case 5: // Comet Lake H and S
case 6: // Comet Lake U
if(support_avx())
#ifndef NO_AVX2
return CORE_HASWELL;
#else
return CORE_SANDYBRIDGE;
#endif
else
return CORE_NEHALEM;
}
case 5:
switch (model) {
case 6:


+ 12
- 0
driver/others/dynamic.c View File

@@ -618,6 +618,18 @@ static gotoblas_t *get_coretype(void){
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
}
}
/* Models 5 and 6 of this group (presumably the Comet Lake parts that
   cpuid_x86.c lists under exmodel 10 - confirm): select the best kernel
   table that the available AVX level allows. */
case 10:
if (model == 5 || model == 6) {
/* AVX2 available: use the Haswell kernels. */
if(support_avx2())
return &gotoblas_HASWELL;
/* AVX only: warn and fall back to the Sandy Bridge kernels. */
if(support_avx()) {
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
return &gotoblas_SANDYBRIDGE;
} else {
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
}
}
return NULL;
}
case 0xf:


+ 119
- 119
kernel/Makefile.L3 View File

@@ -482,8 +482,8 @@ $(KDIR)$(SHGEMMONCOPYOBJ) : $(KERNELDIR)/$(SHGEMMONCOPY)

$(KDIR)$(SHGEMMOTCOPYOBJ) : $(KERNELDIR)/$(SHGEMMOTCOPY)

ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW)
$(CC) $(CFLAGS) -E -DHALF -UDOUBLE -UCOMPLEX $< -o shgemmotcopy.s
ifeq ($(OS), AIX)
$(CC) $(CFLAGS) -S -DHALF -UDOUBLE -UCOMPLEX $< -o - > shgemmotcopy.s
m4 shgemmotcopy.s > shgemmotcopy_nomacros.s
$(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX shgemmotcopy_nomacros.s -o $@
rm shgemmotcopy.s shgemmotcopy_nomacros.s
@@ -497,8 +497,8 @@ $(KDIR)$(SHGEMMINCOPYOBJ) : $(KERNELDIR)/$(SHGEMMINCOPY)
$(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@

$(KDIR)$(SHGEMMITCOPYOBJ) : $(KERNELDIR)/$(SHGEMMITCOPY)
ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW)
$(CC) $(CFLAGS) -E -DHALF -UDOUBLE -UCOMPLEX $< -o shgemmitcopy.s
ifeq ($(OS), AIX)
$(CC) $(CFLAGS) -S -DHALF -UDOUBLE -UCOMPLEX $< -o - > shgemmitcopy.s
m4 shgemmitcopy.s > shgemmitcopy_nomacros.s
$(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX shgemmitcopy_nomacros.s -o $@
rm shgemmitcopy.s shgemmitcopy_nomacros.s
@@ -513,8 +513,8 @@ $(KDIR)$(SGEMMONCOPYOBJ) : $(KERNELDIR)/$(SGEMMONCOPY)
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@

$(KDIR)$(SGEMMOTCOPYOBJ) : $(KERNELDIR)/$(SGEMMOTCOPY)
ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW)
$(CC) $(CFLAGS) -E -UDOUBLE -UCOMPLEX $< -o sgemmotcopy.s
ifeq ($(OS), AIX)
$(CC) $(CFLAGS) -S -UDOUBLE -UCOMPLEX $< -o - > sgemmotcopy.s
m4 sgemmotcopy.s > sgemmotcopy_nomacros.s
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemmotcopy_nomacros.s -o $@
rm sgemmotcopy.s sgemmotcopy_nomacros.s
@@ -529,8 +529,8 @@ $(KDIR)$(SGEMMINCOPYOBJ) : $(KERNELDIR)/$(SGEMMINCOPY)
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@

$(KDIR)$(SGEMMITCOPYOBJ) : $(KERNELDIR)/$(SGEMMITCOPY)
ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW)
$(CC) $(CFLAGS) -E -UDOUBLE -UCOMPLEX $< -o sgemmitcopy.s
ifeq ($(OS), AIX)
$(CC) $(CFLAGS) -S -UDOUBLE -UCOMPLEX $< -o - > sgemmitcopy.s
m4 sgemmitcopy.s > sgemmitcopy_nomacros.s
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemmitcopy_nomacros.s -o $@
rm sgemmitcopy.s sgemmitcopy_nomacros.s
@@ -541,8 +541,8 @@ endif
endif

$(KDIR)$(DGEMMONCOPYOBJ) : $(KERNELDIR)/$(DGEMMONCOPY)
ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW)
$(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o dgemm_ncopy.s
ifeq ($(OS), AIX)
$(CC) $(CFLAGS) -S -DDOUBLE -UCOMPLEX $< -o - > dgemm_ncopy.s
m4 dgemm_ncopy.s > dgemm_ncopy_nomacros.s
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_ncopy_nomacros.s -o $@
rm dgemm_ncopy.s dgemm_ncopy_nomacros.s
@@ -559,8 +559,8 @@ $(KDIR)$(DGEMMINCOPYOBJ) : $(KERNELDIR)/$(DGEMMINCOPY)
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@

$(KDIR)$(DGEMMITCOPYOBJ) : $(KERNELDIR)/$(DGEMMITCOPY)
ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW)
$(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o dgemm_itcopy.s
ifeq ($(OS), AIX)
$(CC) $(CFLAGS) -S -DDOUBLE -UCOMPLEX $< -o - > dgemm_itcopy.s
m4 dgemm_itcopy.s > dgemm_itcopy_nomacros.s
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_itcopy_nomacros.s -o $@
rm dgemm_itcopy.s dgemm_itcopy_nomacros.s
@@ -602,8 +602,8 @@ $(KDIR)$(CGEMMINCOPYOBJ) : $(KERNELDIR)/$(CGEMMINCOPY)
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@

$(KDIR)$(CGEMMITCOPYOBJ) : $(KERNELDIR)/$(CGEMMITCOPY)
ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW)
$(CC) $(CFLAGS) -UDOUBLE -UCOMPLEX -E $< -o cgemm_itcopy.s
ifeq ($(OS), AIX)
$(CC) $(CFLAGS) -UDOUBLE -UCOMPLEX -S $< -o - > cgemm_itcopy.s
m4 cgemm_itcopy.s > cgemm_itcopy_nomacros.s
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX cgemm_itcopy_nomacros.s -o $@
rm cgemm_itcopy.s cgemm_itcopy_nomacros.s
@@ -625,8 +625,8 @@ $(KDIR)$(ZGEMMINCOPYOBJ) : $(KERNELDIR)/$(ZGEMMINCOPY)
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@

$(KDIR)$(ZGEMMITCOPYOBJ) : $(KERNELDIR)/$(ZGEMMITCOPY)
ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW)
$(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o zgemm_itcopy.s
ifeq ($(OS), AIX)
$(CC) $(CFLAGS) -S -DDOUBLE -UCOMPLEX $< -o - > zgemm_itcopy.s
m4 zgemm_itcopy.s > zgemm_itcopy_nomacros.s
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX zgemm_itcopy_nomacros.s -o $@
rm zgemm_itcopy.s zgemm_itcopy_nomacros.s
@@ -657,8 +657,8 @@ endif
endif

$(KDIR)sgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(SGEMMDEPEND)
ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW)
$(CC) $(CFLAGS) -E -UDOUBLE -UCOMPLEX $< -o sgemm_kernel$(TSUFFIX).s
ifeq ($(OS), AIX)
$(CC) $(CFLAGS) -S -UDOUBLE -UCOMPLEX $< -o - > sgemm_kernel$(TSUFFIX).s
m4 sgemm_kernel$(TSUFFIX).s > sgemm_kernel$(TSUFFIX)_nomacros.s
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemm_kernel$(TSUFFIX)_nomacros.s -o $@
rm sgemm_kernel$(TSUFFIX).s sgemm_kernel$(TSUFFIX)_nomacros.s
@@ -669,8 +669,8 @@ endif
ifeq ($(BUILD_HALF), 1)

$(KDIR)shgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SHGEMMKERNEL) $(SHGEMMDEPEND)
ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW)
$(CC) $(CFLAGS) -E -DHALF -UDOUBLE -UCOMPLEX $< -o shgemm_kernel$(TSUFFIX).s
ifeq ($(OS), AIX)
$(CC) $(CFLAGS) -S -DHALF -UDOUBLE -UCOMPLEX $< -o - > shgemm_kernel$(TSUFFIX).s
m4 shgemm_kernel$(TSUFFIX).s > shgemm_kernel$(TSUFFIX)_nomacros.s
$(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX shgemm_kernel$(TSUFFIX)_nomacros.s -o $@
rm shgemm_kernel$(TSUFFIX).s shgemm_kernel$(TSUFFIX)_nomacros.s
@@ -680,8 +680,8 @@ endif
endif

$(KDIR)dgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) $(DGEMMDEPEND)
ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW)
$(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o dgemm_kernel$(TSUFFIX).s
ifeq ($(OS), AIX)
$(CC) $(CFLAGS) -S -DDOUBLE -UCOMPLEX $< -o - > dgemm_kernel$(TSUFFIX).s
m4 dgemm_kernel$(TSUFFIX).s > dgemm_kernel$(TSUFFIX)_nomacros.s
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_kernel$(TSUFFIX)_nomacros.s -o $@
rm dgemm_kernel$(TSUFFIX).s dgemm_kernel$(TSUFFIX)_nomacros.s
@@ -693,9 +693,9 @@ $(KDIR)qgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) $(QGEMMDEP
$(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@

$(KDIR)cgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW)
$(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DNN $< -o cgemm_kernel_n.s
m4 -B 16384 cgemm_kernel_n.s > cgemm_kernel_n_nomacros.s
ifeq ($(OS), AIX)
$(CC) $(CFLAGS) -S -UDOUBLE -DCOMPLEX -DNN $< -o - > cgemm_kernel_n.s
m4 cgemm_kernel_n.s > cgemm_kernel_n_nomacros.s
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN cgemm_kernel_n_nomacros.s -o $@
rm cgemm_kernel_n.s cgemm_kernel_n_nomacros.s
else
@@ -703,9 +703,9 @@ else
endif

$(KDIR)cgemm_kernel_l$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW)
$(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DCN $< -o cgemm_kernel_l.s
m4 -B 16384 cgemm_kernel_l.s > cgemm_kernel_l_nomacros.s
ifeq ($(OS), AIX)
$(CC) $(CFLAGS) -S -UDOUBLE -DCOMPLEX -DCN $< -o - > cgemm_kernel_l.s
m4 cgemm_kernel_l.s > cgemm_kernel_l_nomacros.s
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCN cgemm_kernel_l_nomacros.s -o $@
rm cgemm_kernel_l.s cgemm_kernel_l_nomacros.s
else
@@ -713,9 +713,9 @@ else
endif

$(KDIR)cgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW)
$(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DNC $< -o cgemm_kernel_r.s
m4 -B 16384 cgemm_kernel_r.s > cgemm_kernel_r_nomacros.s
ifeq ($(OS), AIX)
$(CC) $(CFLAGS) -S -UDOUBLE -DCOMPLEX -DNC $< -o - > cgemm_kernel_r.s
m4 cgemm_kernel_r.s > cgemm_kernel_r_nomacros.s
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNC cgemm_kernel_r_nomacros.s -o $@
rm cgemm_kernel_r.s cgemm_kernel_r_nomacros.s
else
@@ -723,9 +723,9 @@ else
endif

$(KDIR)cgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW)
$(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DCC $< -o cgemm_kernel_b.s
m4 -B 16384 cgemm_kernel_b.s > cgemm_kernel_b_nomacros.s
ifeq ($(OS), AIX)
$(CC) $(CFLAGS) -S -UDOUBLE -DCOMPLEX -DCC $< -o - > cgemm_kernel_b.s
m4 cgemm_kernel_b.s > cgemm_kernel_b_nomacros.s
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCC cgemm_kernel_b_nomacros.s -o $@
rm cgemm_kernel_b.s cgemm_kernel_b_nomacros.s
else
@@ -733,9 +733,9 @@ else
endif

$(KDIR)zgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND)
ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW)
$(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DNN $< -o zgemm_kernel_n.s
m4 -B 16384 zgemm_kernel_n.s > zgemm_kernel_n_nomacros.s
ifeq ($(OS), AIX)
$(CC) $(CFLAGS) -S -DDOUBLE -DCOMPLEX -DNN $< -o - > zgemm_kernel_n.s
m4 zgemm_kernel_n.s > zgemm_kernel_n_nomacros.s
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN zgemm_kernel_n_nomacros.s -o $@
rm zgemm_kernel_n.s zgemm_kernel_n_nomacros.s
else
@@ -743,9 +743,9 @@ else
endif

$(KDIR)zgemm_kernel_l$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND)
ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW)
$(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DCN $< -o zgemm_kernel_l.s
m4 -B 16384 zgemm_kernel_l.s > zgemm_kernel_l_nomacros.s
ifeq ($(OS), AIX)
$(CC) $(CFLAGS) -S -DDOUBLE -DCOMPLEX -DCN $< -o - > zgemm_kernel_l.s
m4 zgemm_kernel_l.s > zgemm_kernel_l_nomacros.s
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN zgemm_kernel_l_nomacros.s -o $@
rm zgemm_kernel_l.s zgemm_kernel_l_nomacros.s
else
@@ -753,9 +753,9 @@ else
endif

$(KDIR)zgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND)
ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW)
$(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DNC $< -o zgemm_kernel_r.s
m4 -B 16384 zgemm_kernel_r.s > zgemm_kernel_r_nomacros.s
ifeq ($(OS), AIX)
$(CC) $(CFLAGS) -S -DDOUBLE -DCOMPLEX -DNC $< -o - > zgemm_kernel_r.s
m4 zgemm_kernel_r.s > zgemm_kernel_r_nomacros.s
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC zgemm_kernel_r_nomacros.s -o $@
rm zgemm_kernel_r.s zgemm_kernel_r_nomacros.s
else
@@ -763,9 +763,9 @@ else
endif

$(KDIR)zgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND)
ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW)
$(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DCC $< -o zgemm_kernel_b.s
m4 -B 16384 zgemm_kernel_b.s > zgemm_kernel_b_nomacros.s
ifeq ($(OS), AIX)
$(CC) $(CFLAGS) -S -DDOUBLE -DCOMPLEX -DCC $< -o - > zgemm_kernel_b.s
m4 zgemm_kernel_b.s > zgemm_kernel_b_nomacros.s
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC zgemm_kernel_b_nomacros.s -o $@
rm zgemm_kernel_b.s zgemm_kernel_b_nomacros.s
else
@@ -787,8 +787,8 @@ $(KDIR)xgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMD

ifdef USE_TRMM
$(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW)
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o strmmkernel_ln.s
ifeq ($(OS), AIX)
$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o - > strmmkernel_ln.s
m4 strmmkernel_ln.s > strmmkernel_ln_nomacros.s
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA strmmkernel_ln_nomacros.s -o $@
rm strmmkernel_ln.s strmmkernel_ln_nomacros.s
@@ -797,8 +797,8 @@ else
endif

$(KDIR)strmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW)
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o strmmkernel_lt.s
ifeq ($(OS), AIX)
$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o - > strmmkernel_lt.s
m4 strmmkernel_lt.s > strmmkernel_lt_nomacros.s
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA strmmkernel_lt_nomacros.s -o $@
rm strmmkernel_lt.s strmmkernel_lt_nomacros.s
@@ -807,8 +807,8 @@ else
endif

$(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW)
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o strmmkernel_rn.s
ifeq ($(OS), AIX)
$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o - > strmmkernel_rn.s
m4 strmmkernel_rn.s > strmmkernel_rn_nomacros.s
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA strmmkernel_rn_nomacros.s -o $@
rm strmmkernel_rn.s strmmkernel_rn_nomacros.s
@@ -817,8 +817,8 @@ else
endif

$(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW)
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o strmm_kernel_rt.s
ifeq ($(OS), AIX)
$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o - > strmm_kernel_rt.s
m4 strmm_kernel_rt.s > strmm_kernel_rt_nomacros.s
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@
rm strmm_kernel_rt.s strmm_kernel_rt_nomacros.s
@@ -827,8 +827,8 @@ else
endif

$(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW)
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o dtrmm_kernel_ln.s
ifeq ($(OS), AIX)
$(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o - > dtrmm_kernel_ln.s
m4 dtrmm_kernel_ln.s > dtrmm_kernel_ln_nomacros.s
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA dtrmm_kernel_ln_nomacros.s -o $@
rm dtrmm_kernel_ln.s dtrmm_kernel_ln_nomacros.s
@@ -837,8 +837,8 @@ else
endif

$(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW)
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o dtrmm_kernel_lt.s
ifeq ($(OS), AIX)
$(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o - > dtrmm_kernel_lt.s
m4 dtrmm_kernel_lt.s > dtrmm_kernel_lt_nomacros.s
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA dtrmm_kernel_lt_nomacros.s -o $@
rm dtrmm_kernel_lt.s dtrmm_kernel_lt_nomacros.s
@@ -847,8 +847,8 @@ else
endif

$(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW)
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o dtrmm_kernel_rn.s
ifeq ($(OS), AIX)
$(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o - > dtrmm_kernel_rn.s
m4 dtrmm_kernel_rn.s > dtrmm_kernel_rn_nomacros.s
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA dtrmm_kernel_rn_nomacros.s -o $@
rm dtrmm_kernel_rn.s dtrmm_kernel_rn_nomacros.s
@@ -857,8 +857,8 @@ else
endif

$(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW)
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o dtrmm_kernel_rt.s
ifeq ($(OS), AIX)
$(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o - > dtrmm_kernel_rt.s
m4 dtrmm_kernel_rt.s > dtrmm_kernel_rt_nomacros.s
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA dtrmm_kernel_rt_nomacros.s -o $@
rm dtrmm_kernel_rt.s dtrmm_kernel_rt_nomacros.s
@@ -879,9 +879,9 @@ $(KDIR)qtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@

$(KDIR)ctrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW)
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o ctrmm_kernel_ln.s
m4 -B 16384 ctrmm_kernel_ln.s > ctrmm_kernel_ln_nomacros.s
ifeq ($(OS), AIX)
$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o - > ctrmm_kernel_ln.s
m4 ctrmm_kernel_ln.s > ctrmm_kernel_ln_nomacros.s
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN ctrmm_kernel_ln_nomacros.s -o $@
rm ctrmm_kernel_ln.s ctrmm_kernel_ln_nomacros.s
else
@@ -889,9 +889,9 @@ else
endif

$(KDIR)ctrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW)
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o ctrmm_kernel_lt.s
m4 -B 16384 ctrmm_kernel_lt.s > ctrmm_kernel_lt_nomacros.s
ifeq ($(OS), AIX)
$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o - > ctrmm_kernel_lt.s
m4 ctrmm_kernel_lt.s > ctrmm_kernel_lt_nomacros.s
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN ctrmm_kernel_lt_nomacros.s -o $@
rm ctrmm_kernel_lt.s ctrmm_kernel_lt_nomacros.s
else
@@ -899,9 +899,9 @@ else
endif

$(KDIR)ctrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW)
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o ctrmm_kernel_lr.s
m4 -B 16384 ctrmm_kernel_lr.s > ctrmm_kernel_lr_nomacros.s
ifeq ($(OS), AIX)
$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o - > ctrmm_kernel_lr.s
m4 ctrmm_kernel_lr.s > ctrmm_kernel_lr_nomacros.s
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN ctrmm_kernel_lr_nomacros.s -o $@
rm ctrmm_kernel_lr.s ctrmm_kernel_lr_nomacros.s
else
@@ -909,9 +909,9 @@ else
endif

$(KDIR)ctrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW)
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o ctrmm_kernel_lc.s
m4 -B 16384 ctrmm_kernel_lc.s > ctrmm_kernel_lc_nomacros.s
ifeq ($(OS), AIX)
$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o - > ctrmm_kernel_lc.s
m4 ctrmm_kernel_lc.s > ctrmm_kernel_lc_nomacros.s
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN ctrmm_kernel_lc_nomacros.s -o $@
rm ctrmm_kernel_lc_nomacros.s ctrmm_kernel_lc.s
else
@@ -919,9 +919,9 @@ else
endif

$(KDIR)ctrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW)
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o ctrmm_kernel_rn.s
m4 -B 16384 ctrmm_kernel_rn.s > ctrmm_kernel_rn_nomacros.s
ifeq ($(OS), AIX)
$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o - > ctrmm_kernel_rn.s
m4 ctrmm_kernel_rn.s > ctrmm_kernel_rn_nomacros.s
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN ctrmm_kernel_rn_nomacros.s -o $@
rm ctrmm_kernel_rn.s ctrmm_kernel_rn_nomacros.s
else
@@ -929,9 +929,9 @@ else
endif

$(KDIR)ctrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW)
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o ctrmm_kernel_rt.s
m4 -B 16384 ctrmm_kernel_rt.s > ctrmm_kernel_rt_nomacros.s
ifeq ($(OS), AIX)
$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o - > ctrmm_kernel_rt.s
m4 ctrmm_kernel_rt.s > ctrmm_kernel_rt_nomacros.s
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN ctrmm_kernel_rt_nomacros.s -o $@
rm ctrmm_kernel_rt.s ctrmm_kernel_rt_nomacros.s
else
@@ -939,9 +939,9 @@ else
endif

$(KDIR)ctrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW)
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o ctrmm_kernel_rr.s
m4 -B 16384 ctrmm_kernel_rr.s > ctrmm_kernel_rr_nomacros.s
ifeq ($(OS), AIX)
$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o - > ctrmm_kernel_rr.s
m4 ctrmm_kernel_rr.s > ctrmm_kernel_rr_nomacros.s
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC ctrmm_kernel_rr_nomacros.s -o $@
rm ctrmm_kernel_rr.s ctrmm_kernel_rr_nomacros.s
else
@@ -949,9 +949,9 @@ else
endif

$(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW)
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o ctrmm_kernel_RC.s
m4 -B 16384 ctrmm_kernel_RC.s > ctrmm_kernel_RC_nomacros.s
ifeq ($(OS), AIX)
$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o - > ctrmm_kernel_RC.s
m4 ctrmm_kernel_RC.s > ctrmm_kernel_RC_nomacros.s
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC ctrmm_kernel_RC_nomacros.s -o $@
rm ctrmm_kernel_RC.s ctrmm_kernel_RC_nomacros.s
else
@@ -959,9 +959,9 @@ else
endif

$(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW)
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o ztrmm_kernel_ln.s
m4 -B 16384 ztrmm_kernel_ln.s > ztrmm_kernel_ln_nomacros.s
ifeq ($(OS), AIX)
$(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o - > ztrmm_kernel_ln.s
m4 ztrmm_kernel_ln.s > ztrmm_kernel_ln_nomacros.s
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN ztrmm_kernel_ln_nomacros.s -o $@
rm ztrmm_kernel_ln.s ztrmm_kernel_ln_nomacros.s
else
@@ -969,9 +969,9 @@ else
endif

$(KDIR)ztrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW)
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o ztrmm_kernel_lt.s
m4 -B 16384 ztrmm_kernel_lt.s > ztrmm_kernel_lt_nomacros.s
ifeq ($(OS), AIX)
$(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o - > ztrmm_kernel_lt.s
m4 ztrmm_kernel_lt.s > ztrmm_kernel_lt_nomacros.s
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN ztrmm_kernel_lt_nomacros.s -o $@
rm ztrmm_kernel_lt.s ztrmm_kernel_lt_nomacros.s
else
@@ -979,9 +979,9 @@ else
endif

$(KDIR)ztrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW)
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o ztrmm_kernel_lr.s
m4 -B 16384 ztrmm_kernel_lr.s > ztrmm_kernel_lr_nomacros.s
ifeq ($(OS), AIX)
$(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o - > ztrmm_kernel_lr.s
m4 ztrmm_kernel_lr.s > ztrmm_kernel_lr_nomacros.s
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN ztrmm_kernel_lr_nomacros.s -o $@
rm ztrmm_kernel_lr.s ztrmm_kernel_lr_nomacros.s
else
@@ -989,9 +989,9 @@ else
endif

$(KDIR)ztrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW)
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o ztrmm_kernel_lc.s
m4 -B 16384 ztrmm_kernel_lc.s >ztrmm_kernel_lc_nomacros.s
ifeq ($(OS), AIX)
$(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o - > ztrmm_kernel_lc.s
m4 ztrmm_kernel_lc.s >ztrmm_kernel_lc_nomacros.s
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN ztrmm_kernel_lc_nomacros.s -o $@
rm ztrmm_kernel_lc.s ztrmm_kernel_lc_nomacros.s
else
@@ -999,9 +999,9 @@ else
endif

$(KDIR)ztrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW)
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o ztrmm_kernel_rn.s
m4 -B 16384 ztrmm_kernel_rn.s > ztrmm_kernel_rn_nomacros.s
ifeq ($(OS), AIX)
$(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o - > ztrmm_kernel_rn.s
m4 ztrmm_kernel_rn.s > ztrmm_kernel_rn_nomacros.s
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN ztrmm_kernel_rn_nomacros.s -o $@
rm ztrmm_kernel_rn.s ztrmm_kernel_rn_nomacros.s
else
@@ -1009,9 +1009,9 @@ else
endif

$(KDIR)ztrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW)
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o ztrmm_kernel_rt.s
m4 -B 16384 ztrmm_kernel_rt.s > ztrmm_kernel_rt_nomacros.s
ifeq ($(OS), AIX)
$(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o - > ztrmm_kernel_rt.s
m4 ztrmm_kernel_rt.s > ztrmm_kernel_rt_nomacros.s
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN ztrmm_kernel_rt_nomacros.s -o $@
rm ztrmm_kernel_rt.s ztrmm_kernel_rt_nomacros.s
else
@@ -1019,8 +1019,8 @@ else
endif

$(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW)
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o ztrmm_kernel_rr.s
ifeq ($(OS), AIX)
$(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o - > ztrmm_kernel_rr.s
m4 ztrmm_kernel_rr.s > ztrmm_kernel_rr_nomacros.s
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC ztrmm_kernel_rr_nomacros.s -o $@
rm ztrmm_kernel_rr.s ztrmm_kernel_rr_nomacros.s
@@ -1029,9 +1029,9 @@ else
endif

$(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW)
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o ztrmm_kernel_rc.s
m4 -B 16384 ztrmm_kernel_rc.s > ztrmm_kernel_rc_nomacros.s
ifeq ($(OS), AIX)
$(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o - > ztrmm_kernel_rc.s
m4 ztrmm_kernel_rc.s > ztrmm_kernel_rc_nomacros.s
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC ztrmm_kernel_rc_nomacros.s -o $@
rm ztrmm_kernel_rc.s ztrmm_kernel_rc_nomacros.s
else
@@ -1049,9 +1049,9 @@ $(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@

$(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW)
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o strmm_kernel_rt.s
m4 -B 16384 strmm_kernel_rt.s > strmm_kernel_rt_nomacros.s
ifeq ($(OS), AIX)
$(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o - > strmm_kernel_rt.s
m4 strmm_kernel_rt.s > strmm_kernel_rt_nomacros.s
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@
rm strmm_kernel_rt.s strmm_kernel_rt_nomacros.s
else
@@ -1183,9 +1183,9 @@ $(KDIR)dtrsm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_LN) $(DT
$(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -DUPPER -DLN -UCONJ $< -o $@

$(KDIR)dtrsm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_LT) $(DTRSMDEPEND)
ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW)
$(CC) $(CFLAGS) -E -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ $< -o dtrsm_kernel_lt.s
m4 -B 16384 dtrsm_kernel_lt.s > dtrsm_kernel_lt_nomacros.s
ifeq ($(OS), AIX)
$(CC) $(CFLAGS) -S -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ $< -o - > dtrsm_kernel_lt.s
m4 dtrsm_kernel_lt.s > dtrsm_kernel_lt_nomacros.s
$(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ dtrsm_kernel_lt_nomacros.s -o $@
rm dtrsm_kernel_lt.s dtrsm_kernel_lt_nomacros.s
else
@@ -2459,8 +2459,8 @@ $(KDIR)cgemm_kernel_l$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMM
$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $@

$(KDIR)cgemm_kernel_r$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW)
$(CC) $(PFLAGS) -E -UDOUBLE -DCOMPLEX -DNC $< -o cgemm_kernel_r.s
ifeq ($(OS), AIX)
$(CC) $(PFLAGS) -S -UDOUBLE -DCOMPLEX -DNC $< -o - > cgemm_kernel_r.s
m4 cgemm_kernel_r.s > cgemm_kernel_r_nomacros.s
$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DNC cgemm_kernel_r_nomacros.s -o $@
rm cgemm_kernel_r.s cgemm_kernel_r_nomacros.s
@@ -2505,8 +2505,8 @@ $(KDIR)strmm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
$(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@

$(KDIR)strmm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
ifeq ($(findstring AIXPOW, $(OS)$(filter $(TARGET), POWER8 POWER9 POWER10)),AIXPOW)
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o strmm_kernel_rt.s
ifeq ($(OS), AIX)
# AIX branch of the profiled strmm_kernel_RT object:
#   1. compile the kernel source to assembly (written to stdout, redirected
#      into strmm_kernel_rt.s),
#   2. run that assembly through m4 to expand its macros,
#   3. assemble the macro-free file into the target object,
#   4. remove both temporary files.
# This rule builds the $(PSUFFIX) object, so both compiler invocations use
# $(PFLAGS), matching the cgemm_kernel_r $(PSUFFIX) rule.
	$(CC) $(PFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o - > strmm_kernel_rt.s
# m4 must read the file generated by the previous step (strmm_kernel_rt.s).
	m4 strmm_kernel_rt.s > strmm_kernel_rt_nomacros.s
	$(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@
	rm strmm_kernel_rt.s strmm_kernel_rt_nomacros.s


+ 16
- 16
kernel/generic/gemm_ncopy_16.c View File

@@ -39,24 +39,24 @@
#include <stdio.h>
#include "common.h"

int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
BLASLONG i, j;

FLOAT *aoffset;
FLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4;
FLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8;
FLOAT *aoffset9, *aoffset10, *aoffset11, *aoffset12;
FLOAT *aoffset13, *aoffset14, *aoffset15, *aoffset16;
FLOAT *boffset;
FLOAT ctemp01, ctemp02, ctemp03, ctemp04;
FLOAT ctemp05, ctemp06, ctemp07, ctemp08;
FLOAT ctemp09, ctemp10, ctemp11, ctemp12;
FLOAT ctemp13, ctemp14, ctemp15, ctemp16;
FLOAT ctemp17, ctemp18, ctemp19, ctemp20;
FLOAT ctemp21, ctemp22, ctemp23, ctemp24;
FLOAT ctemp25, ctemp26, ctemp27, ctemp28;
FLOAT ctemp29, ctemp30, ctemp31, ctemp32;
IFLOAT *aoffset;
IFLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4;
IFLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8;
IFLOAT *aoffset9, *aoffset10, *aoffset11, *aoffset12;
IFLOAT *aoffset13, *aoffset14, *aoffset15, *aoffset16;
IFLOAT *boffset;
IFLOAT ctemp01, ctemp02, ctemp03, ctemp04;
IFLOAT ctemp05, ctemp06, ctemp07, ctemp08;
IFLOAT ctemp09, ctemp10, ctemp11, ctemp12;
IFLOAT ctemp13, ctemp14, ctemp15, ctemp16;
IFLOAT ctemp17, ctemp18, ctemp19, ctemp20;
IFLOAT ctemp21, ctemp22, ctemp23, ctemp24;
IFLOAT ctemp25, ctemp26, ctemp27, ctemp28;
IFLOAT ctemp29, ctemp30, ctemp31, ctemp32;

aoffset = a;
boffset = b;


+ 22
- 22
kernel/generic/gemm_ncopy_8.c View File

@@ -39,30 +39,30 @@
#include <stdio.h>
#include "common.h"

int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
BLASLONG i, j;

FLOAT *aoffset;
FLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4;
FLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8;
FLOAT *boffset;
FLOAT ctemp01, ctemp02, ctemp03, ctemp04;
FLOAT ctemp05, ctemp06, ctemp07, ctemp08;
FLOAT ctemp09, ctemp10, ctemp11, ctemp12;
FLOAT ctemp13, ctemp14, ctemp15, ctemp16;
FLOAT ctemp17, ctemp18, ctemp19, ctemp20;
FLOAT ctemp21, ctemp22, ctemp23, ctemp24;
FLOAT ctemp25, ctemp26, ctemp27, ctemp28;
FLOAT ctemp29, ctemp30, ctemp31, ctemp32;
FLOAT ctemp33, ctemp34, ctemp35, ctemp36;
FLOAT ctemp37, ctemp38, ctemp39, ctemp40;
FLOAT ctemp41, ctemp42, ctemp43, ctemp44;
FLOAT ctemp45, ctemp46, ctemp47, ctemp48;
FLOAT ctemp49, ctemp50, ctemp51, ctemp52;
FLOAT ctemp53, ctemp54, ctemp55, ctemp56;
FLOAT ctemp57, ctemp58, ctemp59, ctemp60;
FLOAT ctemp61, ctemp62, ctemp63, ctemp64;
IFLOAT *aoffset;
IFLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4;
IFLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8;
IFLOAT *boffset;
IFLOAT ctemp01, ctemp02, ctemp03, ctemp04;
IFLOAT ctemp05, ctemp06, ctemp07, ctemp08;
IFLOAT ctemp09, ctemp10, ctemp11, ctemp12;
IFLOAT ctemp13, ctemp14, ctemp15, ctemp16;
IFLOAT ctemp17, ctemp18, ctemp19, ctemp20;
IFLOAT ctemp21, ctemp22, ctemp23, ctemp24;
IFLOAT ctemp25, ctemp26, ctemp27, ctemp28;
IFLOAT ctemp29, ctemp30, ctemp31, ctemp32;
IFLOAT ctemp33, ctemp34, ctemp35, ctemp36;
IFLOAT ctemp37, ctemp38, ctemp39, ctemp40;
IFLOAT ctemp41, ctemp42, ctemp43, ctemp44;
IFLOAT ctemp45, ctemp46, ctemp47, ctemp48;
IFLOAT ctemp49, ctemp50, ctemp51, ctemp52;
IFLOAT ctemp53, ctemp54, ctemp55, ctemp56;
IFLOAT ctemp57, ctemp58, ctemp59, ctemp60;
IFLOAT ctemp61, ctemp62, ctemp63, ctemp64;


aoffset = a;


+ 13
- 13
kernel/generic/gemm_tcopy_16.c View File

@@ -39,22 +39,22 @@
#include <stdio.h>
#include "common.h"

int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){

BLASLONG i, j;

FLOAT *aoffset;
FLOAT *aoffset1, *aoffset2;
FLOAT *boffset;
FLOAT ctemp01, ctemp02, ctemp03, ctemp04;
FLOAT ctemp05, ctemp06, ctemp07, ctemp08;
FLOAT ctemp09, ctemp10, ctemp11, ctemp12;
FLOAT ctemp13, ctemp14, ctemp15, ctemp16;
FLOAT ctemp17, ctemp18, ctemp19, ctemp20;
FLOAT ctemp21, ctemp22, ctemp23, ctemp24;
FLOAT ctemp25, ctemp26, ctemp27, ctemp28;
FLOAT ctemp29, ctemp30, ctemp31, ctemp32;
IFLOAT *aoffset;
IFLOAT *aoffset1, *aoffset2;
IFLOAT *boffset;
IFLOAT ctemp01, ctemp02, ctemp03, ctemp04;
IFLOAT ctemp05, ctemp06, ctemp07, ctemp08;
IFLOAT ctemp09, ctemp10, ctemp11, ctemp12;
IFLOAT ctemp13, ctemp14, ctemp15, ctemp16;
IFLOAT ctemp17, ctemp18, ctemp19, ctemp20;
IFLOAT ctemp21, ctemp22, ctemp23, ctemp24;
IFLOAT ctemp25, ctemp26, ctemp27, ctemp28;
IFLOAT ctemp29, ctemp30, ctemp31, ctemp32;

aoffset = a;
boffset = b;


+ 23
- 23
kernel/generic/gemm_tcopy_8.c View File

@@ -39,32 +39,32 @@
#include <stdio.h>
#include "common.h"

int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){

BLASLONG i, j;

FLOAT *aoffset;
FLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4;
FLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8;
FLOAT *boffset, *boffset1, *boffset2, *boffset3, *boffset4;
FLOAT ctemp01, ctemp02, ctemp03, ctemp04;
FLOAT ctemp05, ctemp06, ctemp07, ctemp08;
FLOAT ctemp09, ctemp10, ctemp11, ctemp12;
FLOAT ctemp13, ctemp14, ctemp15, ctemp16;
FLOAT ctemp17, ctemp18, ctemp19, ctemp20;
FLOAT ctemp21, ctemp22, ctemp23, ctemp24;
FLOAT ctemp25, ctemp26, ctemp27, ctemp28;
FLOAT ctemp29, ctemp30, ctemp31, ctemp32;
FLOAT ctemp33, ctemp34, ctemp35, ctemp36;
FLOAT ctemp37, ctemp38, ctemp39, ctemp40;
FLOAT ctemp41, ctemp42, ctemp43, ctemp44;
FLOAT ctemp45, ctemp46, ctemp47, ctemp48;
FLOAT ctemp49, ctemp50, ctemp51, ctemp52;
FLOAT ctemp53, ctemp54, ctemp55, ctemp56;
FLOAT ctemp57, ctemp58, ctemp59, ctemp60;
FLOAT ctemp61, ctemp62, ctemp63, ctemp64;
IFLOAT *aoffset;
IFLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4;
IFLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8;
IFLOAT *boffset, *boffset1, *boffset2, *boffset3, *boffset4;
IFLOAT ctemp01, ctemp02, ctemp03, ctemp04;
IFLOAT ctemp05, ctemp06, ctemp07, ctemp08;
IFLOAT ctemp09, ctemp10, ctemp11, ctemp12;
IFLOAT ctemp13, ctemp14, ctemp15, ctemp16;
IFLOAT ctemp17, ctemp18, ctemp19, ctemp20;
IFLOAT ctemp21, ctemp22, ctemp23, ctemp24;
IFLOAT ctemp25, ctemp26, ctemp27, ctemp28;
IFLOAT ctemp29, ctemp30, ctemp31, ctemp32;
IFLOAT ctemp33, ctemp34, ctemp35, ctemp36;
IFLOAT ctemp37, ctemp38, ctemp39, ctemp40;
IFLOAT ctemp41, ctemp42, ctemp43, ctemp44;
IFLOAT ctemp45, ctemp46, ctemp47, ctemp48;
IFLOAT ctemp49, ctemp50, ctemp51, ctemp52;
IFLOAT ctemp53, ctemp54, ctemp55, ctemp56;
IFLOAT ctemp57, ctemp58, ctemp59, ctemp60;
IFLOAT ctemp61, ctemp62, ctemp63, ctemp64;

aoffset = a;
boffset = b;


+ 11
- 0
kernel/power/KERNEL.POWER10 View File

@@ -7,6 +7,17 @@ else
#CGEMM_BETA = ../generic/zgemm_beta.c
#ZGEMM_BETA = ../generic/zgemm_beta.c

# SHGEMM sources for POWER10: the beta routine and the four copy routines are
# taken from ../generic, while the compute kernel is the POWER10-specific
# shgemm_kernel_power10.c.
SHGEMM_BETA = ../generic/gemm_beta.c
SHGEMMKERNEL = shgemm_kernel_power10.c
# The "I" copy routines use the generic _16 sources and the "O" copy routines
# the generic _8 sources.
# NOTE(review): presumably these widths must agree with
# SHGEMM_DEFAULT_UNROLL_M (16) / SHGEMM_DEFAULT_UNROLL_N (8) in param.h - confirm.
SHGEMMINCOPY = ../generic/gemm_ncopy_16.c
SHGEMMITCOPY = ../generic/gemm_tcopy_16.c
SHGEMMONCOPY = ../generic/gemm_ncopy_8.c
SHGEMMOTCOPY = ../generic/gemm_tcopy_8.c
# Object file names for the four copy routines above.
SHGEMMINCOPYOBJ = shgemm_incopy$(TSUFFIX).$(SUFFIX)
SHGEMMITCOPYOBJ = shgemm_itcopy$(TSUFFIX).$(SUFFIX)
SHGEMMONCOPYOBJ = shgemm_oncopy$(TSUFFIX).$(SUFFIX)
SHGEMMOTCOPYOBJ = shgemm_otcopy$(TSUFFIX).$(SUFFIX)

STRMMKERNEL = sgemm_kernel_power10.c
DTRMMKERNEL = dgemm_kernel_power10.c
CTRMMKERNEL = cgemm_kernel_power10.S


+ 2
- 2
kernel/power/cgemm_kernel_8x4_power8.S View File

@@ -424,7 +424,7 @@ L999:
lwz r16, 204(SP)
lwz r15, 208(SP)
lwz r14, 212(SP)
addi r11, 224
addi r11, SP, 224
#endif
lvx v20, r11, r0
addi r11, r11, 16
@@ -459,4 +459,4 @@ L999:
blr

EPILOGUE
#endif^
#endif

+ 1044
- 0
kernel/power/shgemm_kernel_power10.c
File diff suppressed because it is too large
View File


+ 1
- 1
kernel/x86_64/sgemm_kernel_8x4_haswell_2.c View File

@@ -1,4 +1,4 @@
/* %0 = "+r"(a_pointer), %1 = "+r"(b_pointer), %2 = "+r"(c_pointer), %3 = "+r"(ldc_in_bytes), %4 for k_count, %5 for c_store, %6 = b_pref */
/* %0 = "+r"(a_pointer), %1 = "+r"(b_pointer), %2 = "+r"(c_pointer), %3 = "+r"(ldc_in_bytes), %4 for k_count, %5 for c_store, %6 = b_pref */
/* r10 = tmp, r11 = m_counter, r12 = k << 2(const), r13 = tmp, r14 = b_head_pos(const), r15 = tmp */

/* m = 8 *//* ymm0 for alpha, ymm1-ymm3 for temporary use, ymm4-ymm15 for accumulators */


+ 1
- 1
kernel/x86_64/strsm_kernel_8x4_haswell_LN.c View File

@@ -1,4 +1,4 @@
#include "common.h"
#include "common.h"
#include <stdint.h>
#include "strsm_kernel_8x4_haswell_L_common.h"



+ 1
- 1
kernel/x86_64/strsm_kernel_8x4_haswell_L_common.h View File

@@ -1,4 +1,4 @@
/* r11 = m_counter, r12 = size_of_k_elements, r13 = kk, r14 = b_head, r15 = a_head */
/* r11 = m_counter, r12 = size_of_k_elements, r13 = kk, r14 = b_head, r15 = a_head */
/* register i/o: %0 = a_ptr, %1 = b_ptr, %2 = c_ptr, %3 = c_tmp, %4 = ldc, %5 = k_counter */
/* memory input: %6 = K, %7 = offset, %8 = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0}, %9 = {0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0}, %10 = M */



+ 13
- 0
param.h View File

@@ -2297,6 +2297,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#endif

/* POWER10 only: replace any SHGEMM_DEFAULT_* values defined earlier in this
 * header with the POWER10-specific ones below. */
#if defined(POWER10)
/* Remove the earlier definitions first so that the #defines below do not
 * redefine an existing macro. */
#undef SHGEMM_DEFAULT_UNROLL_N
#undef SHGEMM_DEFAULT_UNROLL_M
#undef SHGEMM_DEFAULT_P
#undef SHGEMM_DEFAULT_R
#undef SHGEMM_DEFAULT_Q
/* NOTE(review): UNROLL_M = 16 / UNROLL_N = 8 presumably have to match the
 * _16 and _8 generic copy routines selected for SHGEMM in
 * kernel/power/KERNEL.POWER10 - confirm. */
#define SHGEMM_DEFAULT_UNROLL_M 16
#define SHGEMM_DEFAULT_UNROLL_N 8
#define SHGEMM_DEFAULT_P 832
#define SHGEMM_DEFAULT_Q 1026
#define SHGEMM_DEFAULT_R 4096
#endif

#if defined(SPARC) && defined(V7)

#define SNUMOPT 4


Loading…
Cancel
Save