powerpc: Optimized SHGEMM kernel for POWER10 (tags/v0.3.11^2)
| @@ -39,24 +39,24 @@ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ | |||
| int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ | |||
| BLASLONG i, j; | |||
| FLOAT *aoffset; | |||
| FLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4; | |||
| FLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8; | |||
| FLOAT *aoffset9, *aoffset10, *aoffset11, *aoffset12; | |||
| FLOAT *aoffset13, *aoffset14, *aoffset15, *aoffset16; | |||
| FLOAT *boffset; | |||
| FLOAT ctemp01, ctemp02, ctemp03, ctemp04; | |||
| FLOAT ctemp05, ctemp06, ctemp07, ctemp08; | |||
| FLOAT ctemp09, ctemp10, ctemp11, ctemp12; | |||
| FLOAT ctemp13, ctemp14, ctemp15, ctemp16; | |||
| FLOAT ctemp17, ctemp18, ctemp19, ctemp20; | |||
| FLOAT ctemp21, ctemp22, ctemp23, ctemp24; | |||
| FLOAT ctemp25, ctemp26, ctemp27, ctemp28; | |||
| FLOAT ctemp29, ctemp30, ctemp31, ctemp32; | |||
| IFLOAT *aoffset; | |||
| IFLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4; | |||
| IFLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8; | |||
| IFLOAT *aoffset9, *aoffset10, *aoffset11, *aoffset12; | |||
| IFLOAT *aoffset13, *aoffset14, *aoffset15, *aoffset16; | |||
| IFLOAT *boffset; | |||
| IFLOAT ctemp01, ctemp02, ctemp03, ctemp04; | |||
| IFLOAT ctemp05, ctemp06, ctemp07, ctemp08; | |||
| IFLOAT ctemp09, ctemp10, ctemp11, ctemp12; | |||
| IFLOAT ctemp13, ctemp14, ctemp15, ctemp16; | |||
| IFLOAT ctemp17, ctemp18, ctemp19, ctemp20; | |||
| IFLOAT ctemp21, ctemp22, ctemp23, ctemp24; | |||
| IFLOAT ctemp25, ctemp26, ctemp27, ctemp28; | |||
| IFLOAT ctemp29, ctemp30, ctemp31, ctemp32; | |||
| aoffset = a; | |||
| boffset = b; | |||
| @@ -39,30 +39,30 @@ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ | |||
| int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ | |||
| BLASLONG i, j; | |||
| FLOAT *aoffset; | |||
| FLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4; | |||
| FLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8; | |||
| FLOAT *boffset; | |||
| FLOAT ctemp01, ctemp02, ctemp03, ctemp04; | |||
| FLOAT ctemp05, ctemp06, ctemp07, ctemp08; | |||
| FLOAT ctemp09, ctemp10, ctemp11, ctemp12; | |||
| FLOAT ctemp13, ctemp14, ctemp15, ctemp16; | |||
| FLOAT ctemp17, ctemp18, ctemp19, ctemp20; | |||
| FLOAT ctemp21, ctemp22, ctemp23, ctemp24; | |||
| FLOAT ctemp25, ctemp26, ctemp27, ctemp28; | |||
| FLOAT ctemp29, ctemp30, ctemp31, ctemp32; | |||
| FLOAT ctemp33, ctemp34, ctemp35, ctemp36; | |||
| FLOAT ctemp37, ctemp38, ctemp39, ctemp40; | |||
| FLOAT ctemp41, ctemp42, ctemp43, ctemp44; | |||
| FLOAT ctemp45, ctemp46, ctemp47, ctemp48; | |||
| FLOAT ctemp49, ctemp50, ctemp51, ctemp52; | |||
| FLOAT ctemp53, ctemp54, ctemp55, ctemp56; | |||
| FLOAT ctemp57, ctemp58, ctemp59, ctemp60; | |||
| FLOAT ctemp61, ctemp62, ctemp63, ctemp64; | |||
| IFLOAT *aoffset; | |||
| IFLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4; | |||
| IFLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8; | |||
| IFLOAT *boffset; | |||
| IFLOAT ctemp01, ctemp02, ctemp03, ctemp04; | |||
| IFLOAT ctemp05, ctemp06, ctemp07, ctemp08; | |||
| IFLOAT ctemp09, ctemp10, ctemp11, ctemp12; | |||
| IFLOAT ctemp13, ctemp14, ctemp15, ctemp16; | |||
| IFLOAT ctemp17, ctemp18, ctemp19, ctemp20; | |||
| IFLOAT ctemp21, ctemp22, ctemp23, ctemp24; | |||
| IFLOAT ctemp25, ctemp26, ctemp27, ctemp28; | |||
| IFLOAT ctemp29, ctemp30, ctemp31, ctemp32; | |||
| IFLOAT ctemp33, ctemp34, ctemp35, ctemp36; | |||
| IFLOAT ctemp37, ctemp38, ctemp39, ctemp40; | |||
| IFLOAT ctemp41, ctemp42, ctemp43, ctemp44; | |||
| IFLOAT ctemp45, ctemp46, ctemp47, ctemp48; | |||
| IFLOAT ctemp49, ctemp50, ctemp51, ctemp52; | |||
| IFLOAT ctemp53, ctemp54, ctemp55, ctemp56; | |||
| IFLOAT ctemp57, ctemp58, ctemp59, ctemp60; | |||
| IFLOAT ctemp61, ctemp62, ctemp63, ctemp64; | |||
| aoffset = a; | |||
| @@ -39,22 +39,22 @@ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ | |||
| int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ | |||
| BLASLONG i, j; | |||
| FLOAT *aoffset; | |||
| FLOAT *aoffset1, *aoffset2; | |||
| FLOAT *boffset; | |||
| FLOAT ctemp01, ctemp02, ctemp03, ctemp04; | |||
| FLOAT ctemp05, ctemp06, ctemp07, ctemp08; | |||
| FLOAT ctemp09, ctemp10, ctemp11, ctemp12; | |||
| FLOAT ctemp13, ctemp14, ctemp15, ctemp16; | |||
| FLOAT ctemp17, ctemp18, ctemp19, ctemp20; | |||
| FLOAT ctemp21, ctemp22, ctemp23, ctemp24; | |||
| FLOAT ctemp25, ctemp26, ctemp27, ctemp28; | |||
| FLOAT ctemp29, ctemp30, ctemp31, ctemp32; | |||
| IFLOAT *aoffset; | |||
| IFLOAT *aoffset1, *aoffset2; | |||
| IFLOAT *boffset; | |||
| IFLOAT ctemp01, ctemp02, ctemp03, ctemp04; | |||
| IFLOAT ctemp05, ctemp06, ctemp07, ctemp08; | |||
| IFLOAT ctemp09, ctemp10, ctemp11, ctemp12; | |||
| IFLOAT ctemp13, ctemp14, ctemp15, ctemp16; | |||
| IFLOAT ctemp17, ctemp18, ctemp19, ctemp20; | |||
| IFLOAT ctemp21, ctemp22, ctemp23, ctemp24; | |||
| IFLOAT ctemp25, ctemp26, ctemp27, ctemp28; | |||
| IFLOAT ctemp29, ctemp30, ctemp31, ctemp32; | |||
| aoffset = a; | |||
| boffset = b; | |||
| @@ -39,32 +39,32 @@ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ | |||
| int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ | |||
| BLASLONG i, j; | |||
| FLOAT *aoffset; | |||
| FLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4; | |||
| FLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8; | |||
| FLOAT *boffset, *boffset1, *boffset2, *boffset3, *boffset4; | |||
| FLOAT ctemp01, ctemp02, ctemp03, ctemp04; | |||
| FLOAT ctemp05, ctemp06, ctemp07, ctemp08; | |||
| FLOAT ctemp09, ctemp10, ctemp11, ctemp12; | |||
| FLOAT ctemp13, ctemp14, ctemp15, ctemp16; | |||
| FLOAT ctemp17, ctemp18, ctemp19, ctemp20; | |||
| FLOAT ctemp21, ctemp22, ctemp23, ctemp24; | |||
| FLOAT ctemp25, ctemp26, ctemp27, ctemp28; | |||
| FLOAT ctemp29, ctemp30, ctemp31, ctemp32; | |||
| FLOAT ctemp33, ctemp34, ctemp35, ctemp36; | |||
| FLOAT ctemp37, ctemp38, ctemp39, ctemp40; | |||
| FLOAT ctemp41, ctemp42, ctemp43, ctemp44; | |||
| FLOAT ctemp45, ctemp46, ctemp47, ctemp48; | |||
| FLOAT ctemp49, ctemp50, ctemp51, ctemp52; | |||
| FLOAT ctemp53, ctemp54, ctemp55, ctemp56; | |||
| FLOAT ctemp57, ctemp58, ctemp59, ctemp60; | |||
| FLOAT ctemp61, ctemp62, ctemp63, ctemp64; | |||
| IFLOAT *aoffset; | |||
| IFLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4; | |||
| IFLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8; | |||
| IFLOAT *boffset, *boffset1, *boffset2, *boffset3, *boffset4; | |||
| IFLOAT ctemp01, ctemp02, ctemp03, ctemp04; | |||
| IFLOAT ctemp05, ctemp06, ctemp07, ctemp08; | |||
| IFLOAT ctemp09, ctemp10, ctemp11, ctemp12; | |||
| IFLOAT ctemp13, ctemp14, ctemp15, ctemp16; | |||
| IFLOAT ctemp17, ctemp18, ctemp19, ctemp20; | |||
| IFLOAT ctemp21, ctemp22, ctemp23, ctemp24; | |||
| IFLOAT ctemp25, ctemp26, ctemp27, ctemp28; | |||
| IFLOAT ctemp29, ctemp30, ctemp31, ctemp32; | |||
| IFLOAT ctemp33, ctemp34, ctemp35, ctemp36; | |||
| IFLOAT ctemp37, ctemp38, ctemp39, ctemp40; | |||
| IFLOAT ctemp41, ctemp42, ctemp43, ctemp44; | |||
| IFLOAT ctemp45, ctemp46, ctemp47, ctemp48; | |||
| IFLOAT ctemp49, ctemp50, ctemp51, ctemp52; | |||
| IFLOAT ctemp53, ctemp54, ctemp55, ctemp56; | |||
| IFLOAT ctemp57, ctemp58, ctemp59, ctemp60; | |||
| IFLOAT ctemp61, ctemp62, ctemp63, ctemp64; | |||
| aoffset = a; | |||
| boffset = b; | |||
| @@ -7,6 +7,17 @@ else | |||
| #CGEMM_BETA = ../generic/zgemm_beta.c | |||
| #ZGEMM_BETA = ../generic/zgemm_beta.c | |||
| SHGEMM_BETA = ../generic/gemm_beta.c | |||
| SHGEMMKERNEL = shgemm_kernel_power10.c | |||
| SHGEMMINCOPY = ../generic/gemm_ncopy_16.c | |||
| SHGEMMITCOPY = ../generic/gemm_tcopy_16.c | |||
| SHGEMMONCOPY = ../generic/gemm_ncopy_8.c | |||
| SHGEMMOTCOPY = ../generic/gemm_tcopy_8.c | |||
| SHGEMMINCOPYOBJ = shgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| SHGEMMITCOPYOBJ = shgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| SHGEMMONCOPYOBJ = shgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| SHGEMMOTCOPYOBJ = shgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| STRMMKERNEL = sgemm_kernel_power10.c | |||
| DTRMMKERNEL = dgemm_kernel_power10.c | |||
| CTRMMKERNEL = cgemm_kernel_power10.S | |||
| @@ -2297,6 +2297,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| #if defined(POWER10) /* POWER10 only: replace any earlier SHGEMM_DEFAULT_* values with the ones below */ | |||
| #undef SHGEMM_DEFAULT_UNROLL_N /* each name is #undef'd first so the #defines below are fresh definitions, not redefinitions */ | |||
| #undef SHGEMM_DEFAULT_UNROLL_M | |||
| #undef SHGEMM_DEFAULT_P | |||
| #undef SHGEMM_DEFAULT_R | |||
| #undef SHGEMM_DEFAULT_Q | |||
| #define SHGEMM_DEFAULT_UNROLL_M 16 /* NOTE(review): presumably pairs with the gemm_ncopy_16/gemm_tcopy_16 routines set for SHGEMMINCOPY/SHGEMMITCOPY -- confirm */ | |||
| #define SHGEMM_DEFAULT_UNROLL_N 8 /* NOTE(review): presumably pairs with the gemm_ncopy_8/gemm_tcopy_8 routines set for SHGEMMONCOPY/SHGEMMOTCOPY -- confirm */ | |||
| #define SHGEMM_DEFAULT_P 832 /* NOTE(review): meaning of P/Q/R is not visible in this hunk; presumably blocking sizes -- confirm */ | |||
| #define SHGEMM_DEFAULT_Q 1026 | |||
| #define SHGEMM_DEFAULT_R 4096 | |||
| #endif /* defined(POWER10) */ | |||
| #if defined(SPARC) && defined(V7) | |||
| #define SNUMOPT 4 | |||