Browse Source

Merge pull request #63 from xianyi/develop

rebase
tags/v0.3.10^2
Martin Kroeker GitHub 5 years ago
parent
commit
fdd1b50263
No known key found for this signature in database GPG Key ID: 4AEE18F83AFDEB23
16 changed files with 127 additions and 1491 deletions
  1. +10
    -1
      Makefile.system
  2. +8
    -8
      benchmark/Makefile
  3. +4
    -2
      benchmark/zdot.c
  4. +3
    -4
      common_x86_64.h
  5. +1
    -1
      f_check
  6. +0
    -1403
      kernel/common_param.h
  7. +89
    -61
      kernel/x86_64/dgemm_kernel_16x2_skylakex.c
  8. +1
    -1
      lapack-netlib/SRC/clargv.f
  9. +1
    -1
      lapack-netlib/SRC/clartg.f
  10. +1
    -1
      lapack-netlib/SRC/dlartg.f
  11. +1
    -1
      lapack-netlib/SRC/dlartgp.f
  12. +1
    -1
      lapack-netlib/SRC/slartg.f
  13. +1
    -1
      lapack-netlib/SRC/slartgp.f
  14. +1
    -1
      lapack-netlib/SRC/zlargv.f
  15. +1
    -1
      lapack-netlib/SRC/zlartg.f
  16. +4
    -3
      test/compare_sgemm_shgemm.c

+ 10
- 1
Makefile.system View File

@@ -783,6 +783,7 @@ endif

ifeq ($(F_COMPILER), FLANG)
CCOMMON_OPT += -DF_INTERFACE_FLANG
FCOMMON_OPT += -frecursive
ifdef BINARY64
ifdef INTERFACE64
ifneq ($(INTERFACE64), 0)
@@ -796,6 +797,11 @@ endif
ifeq ($(USE_OPENMP), 1)
FCOMMON_OPT += -fopenmp
endif
ifeq ($(OSNAME), Linux)
ifeq ($(ARCH), x86_64)
FLANG_VENDOR := $(shell expr `$(FC) --version|cut -f 1 -d "."|head -1`)
endif
endif
endif

ifeq ($(F_COMPILER), G77)
@@ -1270,8 +1276,11 @@ endif

override CFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR)
override PFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) -DPROFILE $(COMMON_PROF)

ifeq ($(FLANG_VENDOR),AOCC)
override FFLAGS += $(filter-out -O2 -O3,$(COMMON_OPT)) -O1 $(FCOMMON_OPT)
else
override FFLAGS += $(COMMON_OPT) $(FCOMMON_OPT)
endif
override FPFLAGS += $(FCOMMON_OPT) $(COMMON_PROF)
#MAKEOVERRIDES =



+ 8
- 8
benchmark/Makefile View File

@@ -1825,7 +1825,7 @@ zsymv.veclib : zsymv.$(SUFFIX)
##################################### Sgeev ####################################################
sgeev.goto : sgeev.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
$(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
sgeev.acml : sgeev.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
@@ -1841,7 +1841,7 @@ sgeev.veclib : sgeev.$(SUFFIX)
##################################### Dgeev ####################################################
dgeev.goto : dgeev.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
$(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
dgeev.acml : dgeev.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
@@ -1858,7 +1858,7 @@ dgeev.veclib : dgeev.$(SUFFIX)
##################################### Cgeev ####################################################
cgeev.goto : cgeev.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
$(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
cgeev.acml : cgeev.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
@@ -1875,7 +1875,7 @@ cgeev.veclib : cgeev.$(SUFFIX)
##################################### Zgeev ####################################################
zgeev.goto : zgeev.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
$(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
zgeev.acml : zgeev.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
@@ -1891,7 +1891,7 @@ zgeev.veclib : zgeev.$(SUFFIX)
##################################### Sgetri ####################################################
sgetri.goto : sgetri.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
$(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
sgetri.acml : sgetri.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
@@ -1907,7 +1907,7 @@ sgetri.veclib : sgetri.$(SUFFIX)
##################################### Dgetri ####################################################
dgetri.goto : dgetri.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
$(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
dgetri.acml : dgetri.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
@@ -1924,7 +1924,7 @@ dgetri.veclib : dgetri.$(SUFFIX)
##################################### Cgetri ####################################################
cgetri.goto : cgetri.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
$(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
cgetri.acml : cgetri.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
@@ -1941,7 +1941,7 @@ cgetri.veclib : cgetri.$(SUFFIX)
##################################### Zgetri ####################################################
zgetri.goto : zgetri.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
$(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
zgetri.acml : zgetri.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)


+ 4
- 2
benchmark/zdot.c View File

@@ -170,9 +170,11 @@ int main(int argc, char *argv[]){
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
gettimeofday( &start, (struct timezone *)0);

#ifdef RETURN_BY_STACK
DOT (&result , &m, x, &inc_x, y, &inc_y );
#else
result = DOT (&m, x, &inc_x, y, &inc_y );

#endif
gettimeofday( &stop, (struct timezone *)0);

time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;


+ 3
- 4
common_x86_64.h View File

@@ -80,7 +80,7 @@ static void __inline blas_lock(volatile BLASULONG *address){
#endif

do {
while (*address) {YIELDING;};
while (*address) {YIELDING;}

#ifndef C_MSVC
__asm__ __volatile__(
@@ -199,9 +199,9 @@ static __inline BLASLONG blas_quickdivide(BLASLONG x, BLASLONG y){
#else
extern unsigned int blas_quick_divide_table[];

static __inline int blas_quickdivide(unsigned int x, unsigned int y){
static __inline unsigned int blas_quickdivide(unsigned int x, unsigned int y){

unsigned int result;
volatile unsigned int result;

if (y <= 1) return x;

@@ -215,7 +215,6 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
y = blas_quick_divide_table[y];

__asm__ __volatile__ ("mull %0" :"=d" (result), "+a"(x) : "0" (y));

return result;
}
#endif


+ 1
- 1
f_check View File

@@ -334,7 +334,7 @@ if ($link ne "") {
&& ($flags !~ /kernel32/)
&& ($flags !~ /advapi32/)
&& ($flags !~ /shell32/)
&& ($flags !~ /omp/)
&& ($flags !~ /omp/ || ($vendor !~ /PGI/ && $flags =~ /omp/))
&& ($flags !~ /[0-9]+/)
&& ($flags !~ /^\-l$/)
) {


+ 0
- 1403
kernel/common_param.h
File diff suppressed because it is too large
View File


+ 89
- 61
kernel/x86_64/dgemm_kernel_16x2_skylakex.c View File

@@ -54,40 +54,40 @@
#define kernel_kstart_n10(mdim,updk) ""
#define kernel_kstart_n12(mdim,updk) ""
#define kernel_kend_n4(mdim) "xorq %3,%3;"\
loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(0,8)\
loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(16,24)
loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(0)\
loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(16)
#define kernel_kend_n6(mdim) "xorq %3,%3;"\
loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(0,8) acc_kend_nc3_k1m##mdim(0,8)\
loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(16,24) acc_kend_nc3_k1m##mdim(16,24)\
loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(32,40)\
loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(48,56)
loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(0) acc_kend_nc3_k1m##mdim(0)\
loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(16) acc_kend_nc3_k1m##mdim(16)\
loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(32)\
loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(48)
#define kernel_kend_n8(mdim) "xorq %3,%3;"\
loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(0,8) acc_kend_nc3_k1m##mdim(0,8) acc_kend_nc4_k1m##mdim(0,8)\
loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(16,24) acc_kend_nc3_k1m##mdim(16,24) acc_kend_nc4_k1m##mdim(16,24)\
loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(32,40) acc_kend_nc4_k1m##mdim(32,40)\
loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(48,56) acc_kend_nc4_k1m##mdim(48,56)\
loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(64,72)\
loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(80,88)
loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(0) acc_kend_nc3_k1m##mdim(0) acc_kend_nc4_k1m##mdim(0)\
loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(16) acc_kend_nc3_k1m##mdim(16) acc_kend_nc4_k1m##mdim(16)\
loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(32) acc_kend_nc4_k1m##mdim(32)\
loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(48) acc_kend_nc4_k1m##mdim(48)\
loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(64)\
loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(80)
#define kernel_kend_n10(mdim) "xorq %3,%3;"\
loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(0,8) acc_kend_nc3_k1m##mdim(0,8) acc_kend_nc4_k1m##mdim(0,8) acc_kend_nc5_k1m##mdim(0,8)\
loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(16,24) acc_kend_nc3_k1m##mdim(16,24) acc_kend_nc4_k1m##mdim(16,24) acc_kend_nc5_k1m##mdim(16,24)\
loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(32,40) acc_kend_nc4_k1m##mdim(32,40) acc_kend_nc5_k1m##mdim(32,40)\
loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(48,56) acc_kend_nc4_k1m##mdim(48,56) acc_kend_nc5_k1m##mdim(48,56)\
loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(64,72) acc_kend_nc5_k1m##mdim(64,72)\
loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(80,88) acc_kend_nc5_k1m##mdim(80,88)\
loada_kend_k1m##mdim acc_kend_nc5_k1m##mdim(96,104)\
loada_kend_k1m##mdim acc_kend_nc5_k1m##mdim(112,120)
loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(0) acc_kend_nc3_k1m##mdim(0) acc_kend_nc4_k1m##mdim(0) acc_kend_nc5_k1m##mdim(0)\
loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(16) acc_kend_nc3_k1m##mdim(16) acc_kend_nc4_k1m##mdim(16) acc_kend_nc5_k1m##mdim(16)\
loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(32) acc_kend_nc4_k1m##mdim(32) acc_kend_nc5_k1m##mdim(32)\
loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(48) acc_kend_nc4_k1m##mdim(48) acc_kend_nc5_k1m##mdim(48)\
loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(64) acc_kend_nc5_k1m##mdim(64)\
loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(80) acc_kend_nc5_k1m##mdim(80)\
loada_kend_k1m##mdim acc_kend_nc5_k1m##mdim(96)\
loada_kend_k1m##mdim acc_kend_nc5_k1m##mdim(112)
#define kernel_kend_n12(mdim) "xorq %3,%3;"\
loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(0,8) acc_kend_nc3_k1m##mdim(0,8) acc_kend_nc4_k1m##mdim(0,8) acc_kend_nc5_k1m##mdim(0,8) acc_kend_nc6_k1m##mdim(0,8)\
loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(16,24) acc_kend_nc3_k1m##mdim(16,24) acc_kend_nc4_k1m##mdim(16,24) acc_kend_nc5_k1m##mdim(16,24) acc_kend_nc6_k1m##mdim(16,24)\
loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(32,40) acc_kend_nc4_k1m##mdim(32,40) acc_kend_nc5_k1m##mdim(32,40) acc_kend_nc6_k1m##mdim(32,40)\
loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(48,56) acc_kend_nc4_k1m##mdim(48,56) acc_kend_nc5_k1m##mdim(48,56) acc_kend_nc6_k1m##mdim(48,56)\
loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(64,72) acc_kend_nc5_k1m##mdim(64,72) acc_kend_nc6_k1m##mdim(64,72)\
loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(80,88) acc_kend_nc5_k1m##mdim(80,88) acc_kend_nc6_k1m##mdim(80,88)\
loada_kend_k1m##mdim acc_kend_nc5_k1m##mdim(96,104) acc_kend_nc6_k1m##mdim(96,104)\
loada_kend_k1m##mdim acc_kend_nc5_k1m##mdim(112,120) acc_kend_nc6_k1m##mdim(112,120)\
loada_kend_k1m##mdim acc_kend_nc6_k1m##mdim(128,136)\
loada_kend_k1m##mdim acc_kend_nc6_k1m##mdim(144,152)
loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(0) acc_kend_nc3_k1m##mdim(0) acc_kend_nc4_k1m##mdim(0) acc_kend_nc5_k1m##mdim(0) acc_kend_nc6_k1m##mdim(0)\
loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(16) acc_kend_nc3_k1m##mdim(16) acc_kend_nc4_k1m##mdim(16) acc_kend_nc5_k1m##mdim(16) acc_kend_nc6_k1m##mdim(16)\
loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(32) acc_kend_nc4_k1m##mdim(32) acc_kend_nc5_k1m##mdim(32) acc_kend_nc6_k1m##mdim(32)\
loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(48) acc_kend_nc4_k1m##mdim(48) acc_kend_nc5_k1m##mdim(48) acc_kend_nc6_k1m##mdim(48)\
loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(64) acc_kend_nc5_k1m##mdim(64) acc_kend_nc6_k1m##mdim(64)\
loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(80) acc_kend_nc5_k1m##mdim(80) acc_kend_nc6_k1m##mdim(80)\
loada_kend_k1m##mdim acc_kend_nc5_k1m##mdim(96) acc_kend_nc6_k1m##mdim(96)\
loada_kend_k1m##mdim acc_kend_nc5_k1m##mdim(112) acc_kend_nc6_k1m##mdim(112)\
loada_kend_k1m##mdim acc_kend_nc6_k1m##mdim(128)\
loada_kend_k1m##mdim acc_kend_nc6_k1m##mdim(144)
#endif
#else
#define HEAD_SET_OFF(ndim) {}
@@ -129,18 +129,28 @@
#define init_update_k(mdim) ""
#define save_update_k(mdim) ""
#endif
#define KERNEL_h_k1m16n1 \
"vmovupd (%0),%%zmm1; vmovupd 64(%0),%%zmm2; addq $128,%0;"\
"vbroadcastsd (%1),%%zmm3; vfmadd231pd %%zmm1,%%zmm3,%%zmm8; vfmadd231pd %%zmm2,%%zmm3,%%zmm9;"
#define KERNEL_k1m16n1 KERNEL_h_k1m16n1 "addq $8,%1;"
#define KERNEL_h_k1m16n2 KERNEL_h_k1m16n1\
#ifdef BROADCAST_KERNEL
#define KERNEL_h_k1m16n2 KERNEL_h_k1m16n1\
"vbroadcastsd 8(%1),%%zmm4; vfmadd231pd %%zmm1,%%zmm4,%%zmm10; vfmadd231pd %%zmm2,%%zmm4,%%zmm11;"
#define KERNEL_k1m16n2 KERNEL_h_k1m16n2 "addq $16,%1;"
#define unit_acc_gen_m16n2(c1_no,c2_no,c3_no,c4_no,boff1,boff2,...)\
#define unit_acc_gen_m16n2(c1_no,c2_no,c3_no,c4_no,boff1,...)\
"vbroadcastsd "#boff1"("#__VA_ARGS__"),%%zmm3; vfmadd231pd %%zmm1,%%zmm3,%%zmm"#c1_no"; vfmadd231pd %%zmm2,%%zmm3,%%zmm"#c2_no";"\
"vbroadcastsd "#boff2"("#__VA_ARGS__"),%%zmm4; vfmadd231pd %%zmm1,%%zmm4,%%zmm"#c3_no"; vfmadd231pd %%zmm2,%%zmm4,%%zmm"#c4_no";"
#define unit_acc_m16n2(c1_no,c2_no,c3_no,c4_no,...) unit_acc_gen_m16n2(c1_no,c2_no,c3_no,c4_no,0,8,__VA_ARGS__)
"vbroadcastsd "#boff1"+8("#__VA_ARGS__"),%%zmm4; vfmadd231pd %%zmm1,%%zmm4,%%zmm"#c3_no"; vfmadd231pd %%zmm2,%%zmm4,%%zmm"#c4_no";"
#define unit_acc_m16n2(c1_no,c2_no,c3_no,c4_no,...) unit_acc_gen_m16n2(c1_no,c2_no,c3_no,c4_no,0,__VA_ARGS__)
#else
#define unit_acc_gen_m16n2(c1_no,c2_no,c3_no,c4_no,boff1,...)\
"vbroadcastf32x4 "#boff1"("#__VA_ARGS__"),%%zmm5; vfmadd231pd %%zmm1,%%zmm5,%%zmm"#c1_no"; vfmadd231pd %%zmm2,%%zmm5,%%zmm"#c2_no";"\
"vfmadd231pd %%zmm3,%%zmm5,%%zmm"#c3_no"; vfmadd231pd %%zmm4,%%zmm5,%%zmm"#c4_no";"
#define unit_acc_m16n2(c1_no,c2_no,c3_no,c4_no,...) unit_acc_gen_m16n2(c1_no,c2_no,c3_no,c4_no,0,__VA_ARGS__)
#define KERNEL_h_k1m16n2 \
"vmovddup (%0),%%zmm1; vmovddup 8(%0),%%zmm2; vmovddup 64(%0),%%zmm3; vmovddup 72(%0),%%zmm4; addq $128,%0;"\
unit_acc_m16n2(8,9,10,11,%1)
#endif
#define KERNEL_k1m16n2 KERNEL_h_k1m16n2 "addq $16,%1;"
#define KERNEL_h_k1m16n4 KERNEL_h_k1m16n2 "prefetcht0 384(%0);" unit_acc_m16n2(12,13,14,15,%1,%%r12,1)
#define KERNEL_k1m16n4 KERNEL_h_k1m16n4 "addq $16,%1;"
#define KERNEL_k1m16n6 KERNEL_h_k1m16n4 unit_acc_m16n2(16,17,18,19,%1,%%r12,2) "addq $16,%1;"
@@ -151,24 +161,42 @@
#define KERNEL_h_k1m16n12 KERNEL_h_k1m16n10 unit_acc_m16n2(28,29,30,31,%%r15,%%r12,2)
#define KERNEL_k1m16n12 KERNEL_h_k1m16n12 "addq $16,%%r15;"
#if defined(TRMMKERNEL) && !defined(LEFT) && (BACKWARDS == 0)
#ifdef BROADCAST_KERNEL
#define loada_kend_k1m16 "vmovupd (%0,%3,1),%%zmm1; vmovupd 64(%0,%3,1),%%zmm2; addq $128,%3;"
#define acc_kend_nc2_k1m16(boff1,boff2) unit_acc_gen_m16n2(12,13,14,15,boff1,boff2,%1,%%r12,1)
#define acc_kend_nc3_k1m16(boff1,boff2) unit_acc_gen_m16n2(16,17,18,19,boff1,boff2,%1,%%r12,2)
#define acc_kend_nc4_k1m16(boff1,boff2) unit_acc_gen_m16n2(20,21,22,23,boff1,boff2,%%r15)
#define acc_kend_nc5_k1m16(boff1,boff2) unit_acc_gen_m16n2(24,25,26,27,boff1,boff2,%%r15,%%r12,1)
#define acc_kend_nc6_k1m16(boff1,boff2) unit_acc_gen_m16n2(28,29,30,31,boff1,boff2,%%r15,%%r12,2)
#else
#define loada_kend_k1m16 "vmovddup (%0,%3,1),%%zmm1; vmovddup 8(%0,%3,1),%%zmm2; vmovddup 64(%0,%3,1),%%zmm3; vmovddup 72(%0,%3,1),%%zmm4; addq $128,%3;"
#endif
#define acc_kend_nc2_k1m16(boff1) unit_acc_gen_m16n2(12,13,14,15,boff1,%1,%%r12,1)
#define acc_kend_nc3_k1m16(boff1) unit_acc_gen_m16n2(16,17,18,19,boff1,%1,%%r12,2)
#define acc_kend_nc4_k1m16(boff1) unit_acc_gen_m16n2(20,21,22,23,boff1,%%r15)
#define acc_kend_nc5_k1m16(boff1) unit_acc_gen_m16n2(24,25,26,27,boff1,%%r15,%%r12,1)
#define acc_kend_nc6_k1m16(boff1) unit_acc_gen_m16n2(28,29,30,31,boff1,%%r15,%%r12,2)
#endif
#define save_init_m16 "movq %2,%3; addq $128,%2;"
#ifdef TRMMKERNEL
#define SAVE_m16n1 "vmulpd %%zmm8,%%zmm0,%%zmm8; vmovupd %%zmm8,(%2); vmulpd %%zmm9,%%zmm0,%%zmm9; vmovupd %%zmm9,64(%2); addq $128,%2;"
#ifdef BROADCAST_KERNEL
#define unit_save_m16n2(c1_no,c2_no,c3_no,c4_no)\
"vmulpd %%zmm"#c1_no",%%zmm0,%%zmm"#c1_no"; vmovupd %%zmm"#c1_no",(%3); vmulpd %%zmm"#c2_no",%%zmm0,%%zmm"#c2_no"; vmovupd %%zmm"#c2_no",64(%3);"\
"vmulpd %%zmm"#c3_no",%%zmm0,%%zmm"#c3_no"; vmovupd %%zmm"#c3_no",(%3,%4,1); vmulpd %%zmm"#c4_no",%%zmm0,%%zmm"#c4_no"; vmovupd %%zmm"#c4_no",64(%3,%4,1); leaq (%3,%4,2),%3;"
#else
#define unit_save_m16n2(c1_no,c2_no,c3_no,c4_no)\
"vunpcklpd %%zmm"#c2_no",%%zmm"#c1_no",%%zmm1; vunpcklpd %%zmm"#c4_no",%%zmm"#c3_no",%%zmm2; vunpckhpd %%zmm"#c2_no",%%zmm"#c1_no",%%zmm3; vunpckhpd %%zmm"#c4_no",%%zmm"#c3_no",%%zmm4;"\
"vmulpd %%zmm1,%%zmm0,%%zmm1; vmovupd %%zmm1,(%3); vmulpd %%zmm2,%%zmm0,%%zmm2; vmovupd %%zmm2,64(%3);"\
"vmulpd %%zmm3,%%zmm0,%%zmm3; vmovupd %%zmm3,(%3,%4,1); vmulpd %%zmm4,%%zmm0,%%zmm4; vmovupd %%zmm4,64(%3,%4,1); leaq (%3,%4,2),%3;"
#endif
#else
#define SAVE_m16n1 "vfmadd213pd (%2),%%zmm0,%%zmm8; vmovupd %%zmm8,(%2); vfmadd213pd 64(%2),%%zmm0,%%zmm9; vmovupd %%zmm9,64(%2); addq $128,%2;"
#ifdef BROADCAST_KERNEL
#define unit_save_m16n2(c1_no,c2_no,c3_no,c4_no)\
"vfmadd213pd (%3),%%zmm0,%%zmm"#c1_no"; vmovupd %%zmm"#c1_no",(%3); vfmadd213pd 64(%3),%%zmm0,%%zmm"#c2_no"; vmovupd %%zmm"#c2_no",64(%3);"\
"vfmadd213pd (%3,%4,1),%%zmm0,%%zmm"#c3_no"; vmovupd %%zmm"#c3_no",(%3,%4,1); vfmadd213pd 64(%3,%4,1),%%zmm0,%%zmm"#c4_no"; vmovupd %%zmm"#c4_no",64(%3,%4,1); leaq (%3,%4,2),%3;"
#else
#define unit_save_m16n2(c1_no,c2_no,c3_no,c4_no)\
"vunpcklpd %%zmm"#c2_no",%%zmm"#c1_no",%%zmm1; vunpcklpd %%zmm"#c4_no",%%zmm"#c3_no",%%zmm2; vunpckhpd %%zmm"#c2_no",%%zmm"#c1_no",%%zmm3; vunpckhpd %%zmm"#c4_no",%%zmm"#c3_no",%%zmm4;"\
"vfmadd213pd (%3),%%zmm0,%%zmm1; vmovupd %%zmm1,(%3); vfmadd213pd 64(%3),%%zmm0,%%zmm2; vmovupd %%zmm2,64(%3);"\
"vfmadd213pd (%3,%4,1),%%zmm0,%%zmm3; vmovupd %%zmm3,(%3,%4,1); vfmadd213pd 64(%3,%4,1),%%zmm0,%%zmm4; vmovupd %%zmm4,64(%3,%4,1); leaq (%3,%4,2),%3;"
#endif
#endif
#define SAVE_m16n2 save_init_m16 unit_save_m16n2(8,9,10,11)
#define SAVE_m16n4 SAVE_m16n2 unit_save_m16n2(12,13,14,15)
@@ -206,11 +234,11 @@
#define KERNEL_k1m8n12 KERNEL_h_k1m8n12 "addq $16,%%r15;"
#if defined(TRMMKERNEL) && !defined(LEFT) && (BACKWARDS == 0)
#define loada_kend_k1m8 "vmovddup (%0,%3,1),%%zmm1; vmovddup 8(%0,%3,1),%%zmm2; addq $64,%3;"
#define acc_kend_nc2_k1m8(boff1,boff2) unit_acc_gen_m8n2(10,11,boff1,%1,%%r12,1)
#define acc_kend_nc3_k1m8(boff1,boff2) unit_acc_gen_m8n2(12,13,boff1,%1,%%r12,2)
#define acc_kend_nc4_k1m8(boff1,boff2) unit_acc_gen_m8n2(14,15,boff1,%%r15)
#define acc_kend_nc5_k1m8(boff1,boff2) unit_acc_gen_m8n2(16,17,boff1,%%r15,%%r12,1)
#define acc_kend_nc6_k1m8(boff1,boff2) unit_acc_gen_m8n2(18,19,boff1,%%r15,%%r12,2)
#define acc_kend_nc2_k1m8(boff1) unit_acc_gen_m8n2(10,11,boff1,%1,%%r12,1)
#define acc_kend_nc3_k1m8(boff1) unit_acc_gen_m8n2(12,13,boff1,%1,%%r12,2)
#define acc_kend_nc4_k1m8(boff1) unit_acc_gen_m8n2(14,15,boff1,%%r15)
#define acc_kend_nc5_k1m8(boff1) unit_acc_gen_m8n2(16,17,boff1,%%r15,%%r12,1)
#define acc_kend_nc6_k1m8(boff1) unit_acc_gen_m8n2(18,19,boff1,%%r15,%%r12,2)
#endif
#define save_init_m8 "movq %2,%3; addq $64,%2;"
#ifdef TRMMKERNEL
@@ -258,11 +286,11 @@
#define KERNEL_k1m4n12 KERNEL_h_k1m4n12 "addq $16,%%r15;"
#if defined(TRMMKERNEL) && !defined(LEFT) && (BACKWARDS == 0)
#define loada_kend_k1m4 "vmovddup (%0,%3,1),%%ymm1; vmovddup 8(%0,%3,1),%%ymm2; addq $32,%3;"
#define acc_kend_nc2_k1m4(boff1,boff2) unit_acc_gen_m4n2(6,7,boff1,%1,%%r12,1)
#define acc_kend_nc3_k1m4(boff1,boff2) unit_acc_gen_m4n2(8,9,boff1,%1,%%r12,2)
#define acc_kend_nc4_k1m4(boff1,boff2) unit_acc_gen_m4n2(10,11,boff1,%%r15)
#define acc_kend_nc5_k1m4(boff1,boff2) unit_acc_gen_m4n2(12,13,boff1,%%r15,%%r12,1)
#define acc_kend_nc6_k1m4(boff1,boff2) unit_acc_gen_m4n2(14,15,boff1,%%r15,%%r12,2)
#define acc_kend_nc2_k1m4(boff1) unit_acc_gen_m4n2(6,7,boff1,%1,%%r12,1)
#define acc_kend_nc3_k1m4(boff1) unit_acc_gen_m4n2(8,9,boff1,%1,%%r12,2)
#define acc_kend_nc4_k1m4(boff1) unit_acc_gen_m4n2(10,11,boff1,%%r15)
#define acc_kend_nc5_k1m4(boff1) unit_acc_gen_m4n2(12,13,boff1,%%r15,%%r12,1)
#define acc_kend_nc6_k1m4(boff1) unit_acc_gen_m4n2(14,15,boff1,%%r15,%%r12,2)
#endif
#define save_init_m4 "movq %2,%3; addq $32,%2;"
#ifdef TRMMKERNEL
@@ -311,11 +339,11 @@
#define KERNEL_k1m2n12 KERNEL_h_k1m2n12 "addq $16,%%r15;"
#if defined(TRMMKERNEL) && !defined(LEFT) && (BACKWARDS == 0)
#define loada_kend_k1m2 "vmovddup (%0,%3,1),%%xmm1; vmovddup 8(%0,%3,1),%%xmm2; addq $16,%3;"
#define acc_kend_nc2_k1m2(boff1,boff2) unit_acc_gen_m2n2(6,7,boff1,%1,%%r12,1)
#define acc_kend_nc3_k1m2(boff1,boff2) unit_acc_gen_m2n2(8,9,boff1,%1,%%r12,2)
#define acc_kend_nc4_k1m2(boff1,boff2) unit_acc_gen_m2n2(10,11,boff1,%%r15)
#define acc_kend_nc5_k1m2(boff1,boff2) unit_acc_gen_m2n2(12,13,boff1,%%r15,%%r12,1)
#define acc_kend_nc6_k1m2(boff1,boff2) unit_acc_gen_m2n2(14,15,boff1,%%r15,%%r12,2)
#define acc_kend_nc2_k1m2(boff1) unit_acc_gen_m2n2(6,7,boff1,%1,%%r12,1)
#define acc_kend_nc3_k1m2(boff1) unit_acc_gen_m2n2(8,9,boff1,%1,%%r12,2)
#define acc_kend_nc4_k1m2(boff1) unit_acc_gen_m2n2(10,11,boff1,%%r15)
#define acc_kend_nc5_k1m2(boff1) unit_acc_gen_m2n2(12,13,boff1,%%r15,%%r12,1)
#define acc_kend_nc6_k1m2(boff1) unit_acc_gen_m2n2(14,15,boff1,%%r15,%%r12,2)
#endif
#define save_init_m2 "movq %2,%3; addq $16,%2;"
#ifdef TRMMKERNEL
@@ -362,11 +390,11 @@
#define KERNEL_k1m1n12 KERNEL_h_k1m1n12 "addq $16,%%r15;"
#if defined(TRMMKERNEL) && !defined(LEFT) && (BACKWARDS == 0)
#define loada_kend_k1m1 "vmovddup (%0,%3,1),%%xmm1; addq $8,%3;"
#define acc_kend_nc2_k1m1(boff1,boff2) "vfmadd231pd "#boff1"(%1,%%r12,1),%%xmm1,%%xmm5;"
#define acc_kend_nc3_k1m1(boff1,boff2) "vfmadd231pd "#boff1"(%1,%%r12,2),%%xmm1,%%xmm6;"
#define acc_kend_nc4_k1m1(boff1,boff2) "vfmadd231pd "#boff1"(%%r15),%%xmm1,%%xmm7;"
#define acc_kend_nc5_k1m1(boff1,boff2) "vfmadd231pd "#boff1"(%%r15,%%r12,1),%%xmm1,%%xmm8;"
#define acc_kend_nc6_k1m1(boff1,boff2) "vfmadd231pd "#boff1"(%%r15,%%r12,2),%%xmm1,%%xmm9;"
#define acc_kend_nc2_k1m1(boff1) "vfmadd231pd "#boff1"(%1,%%r12,1),%%xmm1,%%xmm5;"
#define acc_kend_nc3_k1m1(boff1) "vfmadd231pd "#boff1"(%1,%%r12,2),%%xmm1,%%xmm6;"
#define acc_kend_nc4_k1m1(boff1) "vfmadd231pd "#boff1"(%%r15),%%xmm1,%%xmm7;"
#define acc_kend_nc5_k1m1(boff1) "vfmadd231pd "#boff1"(%%r15,%%r12,1),%%xmm1,%%xmm8;"
#define acc_kend_nc6_k1m1(boff1) "vfmadd231pd "#boff1"(%%r15,%%r12,2),%%xmm1,%%xmm9;"
#endif
#define save_init_m1 "movq %2,%3; addq $8,%2;"
#ifdef TRMMKERNEL


+ 1
- 1
lapack-netlib/SRC/clargv.f View File

@@ -200,7 +200,7 @@
FS = FS*SAFMN2
GS = GS*SAFMN2
SCALE = SCALE*SAFMN2
IF( SCALE.GE.SAFMX2 )
IF( SCALE.GE.SAFMX2 .AND. COUNT .LT. 20 )
$ GO TO 10
ELSE IF( SCALE.LE.SAFMN2 ) THEN
IF( G.EQ.CZERO ) THEN


+ 1
- 1
lapack-netlib/SRC/clartg.f View File

@@ -161,7 +161,7 @@
FS = FS*SAFMN2
GS = GS*SAFMN2
SCALE = SCALE*SAFMN2
IF( SCALE.GE.SAFMX2 )
IF( SCALE.GE.SAFMX2 .AND. COUNT .LT. 20)
$ GO TO 10
ELSE IF( SCALE.LE.SAFMN2 ) THEN
IF( G.EQ.CZERO.OR.SISNAN( ABS( G ) ) ) THEN


+ 1
- 1
lapack-netlib/SRC/dlartg.f View File

@@ -163,7 +163,7 @@
F1 = F1*SAFMN2
G1 = G1*SAFMN2
SCALE = MAX( ABS( F1 ), ABS( G1 ) )
IF( SCALE.GE.SAFMX2 )
IF( SCALE.GE.SAFMX2 .AND. COUNT .LT. 20)
$ GO TO 10
R = SQRT( F1**2+G1**2 )
CS = F1 / R


+ 1
- 1
lapack-netlib/SRC/dlartgp.f View File

@@ -161,7 +161,7 @@
F1 = F1*SAFMN2
G1 = G1*SAFMN2
SCALE = MAX( ABS( F1 ), ABS( G1 ) )
IF( SCALE.GE.SAFMX2 )
IF( SCALE.GE.SAFMX2 .AND. COUNT .LT. 20 )
$ GO TO 10
R = SQRT( F1**2+G1**2 )
CS = F1 / R


+ 1
- 1
lapack-netlib/SRC/slartg.f View File

@@ -163,7 +163,7 @@
F1 = F1*SAFMN2
G1 = G1*SAFMN2
SCALE = MAX( ABS( F1 ), ABS( G1 ) )
IF( SCALE.GE.SAFMX2 )
IF( SCALE.GE.SAFMX2 .AND. COUNT .LT. 20)
$ GO TO 10
R = SQRT( F1**2+G1**2 )
CS = F1 / R


+ 1
- 1
lapack-netlib/SRC/slartgp.f View File

@@ -161,7 +161,7 @@
F1 = F1*SAFMN2
G1 = G1*SAFMN2
SCALE = MAX( ABS( F1 ), ABS( G1 ) )
IF( SCALE.GE.SAFMX2 )
IF( SCALE.GE.SAFMX2 .AND. COUNT .LT. 20)
$ GO TO 10
R = SQRT( F1**2+G1**2 )
CS = F1 / R


+ 1
- 1
lapack-netlib/SRC/zlargv.f View File

@@ -201,7 +201,7 @@
FS = FS*SAFMN2
GS = GS*SAFMN2
SCALE = SCALE*SAFMN2
IF( SCALE.GE.SAFMX2 )
IF( SCALE.GE.SAFMX2 .AND. COUNT .LT. 20 )
$ GO TO 10
ELSE IF( SCALE.LE.SAFMN2 ) THEN
IF( G.EQ.CZERO ) THEN


+ 1
- 1
lapack-netlib/SRC/zlartg.f View File

@@ -161,7 +161,7 @@
FS = FS*SAFMN2
GS = GS*SAFMN2
SCALE = SCALE*SAFMN2
IF( SCALE.GE.SAFMX2 )
IF( SCALE.GE.SAFMX2 .AND. COUNT .LT. 20 )
$ GO TO 10
ELSE IF( SCALE.LE.SAFMN2 ) THEN
IF( G.EQ.CZERO.OR.DISNAN( ABS( G ) ) ) THEN


+ 4
- 3
test/compare_sgemm_shgemm.c View File

@@ -72,12 +72,13 @@ main (int argc, char *argv[])
{
int m, n, k;
int i, j, l;
int x;
int ret = 0;
int loop = 100;
char transA = 'N', transB = 'N';
float alpha = 1.0, beta = 0.0;

for (int x = 0; x <= loop; x++)
for (x = 0; x <= loop; x++)
{
m = k = n = x;
float A[m * k];
@@ -86,9 +87,9 @@ main (int argc, char *argv[])
bfloat16_bits AA[m * k], BB[k * n];
float DD[m * n], CC[m * n];

for (int j = 0; j < m; j++)
for (j = 0; j < m; j++)
{
for (int i = 0; i < m; i++)
for (i = 0; i < m; i++)
{
A[j * k + i] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5;
B[j * k + i] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5;


Loading…
Cancel
Save