Browse Source

Merge pull request #2686 from RajalakshmiSR/p10_shgemm

powerpc: Optimized SHGEMM kernel for POWER10
tags/v0.3.11^2
Martin Kroeker GitHub 6 years ago
parent
commit
c2467c9619
No known key found for this signature in database GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 1142 additions and 74 deletions
  1. +16
    -16
      kernel/generic/gemm_ncopy_16.c
  2. +22
    -22
      kernel/generic/gemm_ncopy_8.c
  3. +13
    -13
      kernel/generic/gemm_tcopy_16.c
  4. +23
    -23
      kernel/generic/gemm_tcopy_8.c
  5. +11
    -0
      kernel/power/KERNEL.POWER10
  6. +1044
    -0
      kernel/power/shgemm_kernel_power10.c
  7. +13
    -0
      param.h

+ 16
- 16
kernel/generic/gemm_ncopy_16.c View File

@@ -39,24 +39,24 @@
#include <stdio.h>
#include "common.h"

int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
BLASLONG i, j;

FLOAT *aoffset;
FLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4;
FLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8;
FLOAT *aoffset9, *aoffset10, *aoffset11, *aoffset12;
FLOAT *aoffset13, *aoffset14, *aoffset15, *aoffset16;
FLOAT *boffset;
FLOAT ctemp01, ctemp02, ctemp03, ctemp04;
FLOAT ctemp05, ctemp06, ctemp07, ctemp08;
FLOAT ctemp09, ctemp10, ctemp11, ctemp12;
FLOAT ctemp13, ctemp14, ctemp15, ctemp16;
FLOAT ctemp17, ctemp18, ctemp19, ctemp20;
FLOAT ctemp21, ctemp22, ctemp23, ctemp24;
FLOAT ctemp25, ctemp26, ctemp27, ctemp28;
FLOAT ctemp29, ctemp30, ctemp31, ctemp32;
IFLOAT *aoffset;
IFLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4;
IFLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8;
IFLOAT *aoffset9, *aoffset10, *aoffset11, *aoffset12;
IFLOAT *aoffset13, *aoffset14, *aoffset15, *aoffset16;
IFLOAT *boffset;
IFLOAT ctemp01, ctemp02, ctemp03, ctemp04;
IFLOAT ctemp05, ctemp06, ctemp07, ctemp08;
IFLOAT ctemp09, ctemp10, ctemp11, ctemp12;
IFLOAT ctemp13, ctemp14, ctemp15, ctemp16;
IFLOAT ctemp17, ctemp18, ctemp19, ctemp20;
IFLOAT ctemp21, ctemp22, ctemp23, ctemp24;
IFLOAT ctemp25, ctemp26, ctemp27, ctemp28;
IFLOAT ctemp29, ctemp30, ctemp31, ctemp32;

aoffset = a;
boffset = b;


+ 22
- 22
kernel/generic/gemm_ncopy_8.c View File

@@ -39,30 +39,30 @@
#include <stdio.h>
#include "common.h"

int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
BLASLONG i, j;

FLOAT *aoffset;
FLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4;
FLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8;
FLOAT *boffset;
FLOAT ctemp01, ctemp02, ctemp03, ctemp04;
FLOAT ctemp05, ctemp06, ctemp07, ctemp08;
FLOAT ctemp09, ctemp10, ctemp11, ctemp12;
FLOAT ctemp13, ctemp14, ctemp15, ctemp16;
FLOAT ctemp17, ctemp18, ctemp19, ctemp20;
FLOAT ctemp21, ctemp22, ctemp23, ctemp24;
FLOAT ctemp25, ctemp26, ctemp27, ctemp28;
FLOAT ctemp29, ctemp30, ctemp31, ctemp32;
FLOAT ctemp33, ctemp34, ctemp35, ctemp36;
FLOAT ctemp37, ctemp38, ctemp39, ctemp40;
FLOAT ctemp41, ctemp42, ctemp43, ctemp44;
FLOAT ctemp45, ctemp46, ctemp47, ctemp48;
FLOAT ctemp49, ctemp50, ctemp51, ctemp52;
FLOAT ctemp53, ctemp54, ctemp55, ctemp56;
FLOAT ctemp57, ctemp58, ctemp59, ctemp60;
FLOAT ctemp61, ctemp62, ctemp63, ctemp64;
IFLOAT *aoffset;
IFLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4;
IFLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8;
IFLOAT *boffset;
IFLOAT ctemp01, ctemp02, ctemp03, ctemp04;
IFLOAT ctemp05, ctemp06, ctemp07, ctemp08;
IFLOAT ctemp09, ctemp10, ctemp11, ctemp12;
IFLOAT ctemp13, ctemp14, ctemp15, ctemp16;
IFLOAT ctemp17, ctemp18, ctemp19, ctemp20;
IFLOAT ctemp21, ctemp22, ctemp23, ctemp24;
IFLOAT ctemp25, ctemp26, ctemp27, ctemp28;
IFLOAT ctemp29, ctemp30, ctemp31, ctemp32;
IFLOAT ctemp33, ctemp34, ctemp35, ctemp36;
IFLOAT ctemp37, ctemp38, ctemp39, ctemp40;
IFLOAT ctemp41, ctemp42, ctemp43, ctemp44;
IFLOAT ctemp45, ctemp46, ctemp47, ctemp48;
IFLOAT ctemp49, ctemp50, ctemp51, ctemp52;
IFLOAT ctemp53, ctemp54, ctemp55, ctemp56;
IFLOAT ctemp57, ctemp58, ctemp59, ctemp60;
IFLOAT ctemp61, ctemp62, ctemp63, ctemp64;


aoffset = a;


+ 13
- 13
kernel/generic/gemm_tcopy_16.c View File

@@ -39,22 +39,22 @@
#include <stdio.h>
#include "common.h"

int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){

BLASLONG i, j;

FLOAT *aoffset;
FLOAT *aoffset1, *aoffset2;
FLOAT *boffset;
FLOAT ctemp01, ctemp02, ctemp03, ctemp04;
FLOAT ctemp05, ctemp06, ctemp07, ctemp08;
FLOAT ctemp09, ctemp10, ctemp11, ctemp12;
FLOAT ctemp13, ctemp14, ctemp15, ctemp16;
FLOAT ctemp17, ctemp18, ctemp19, ctemp20;
FLOAT ctemp21, ctemp22, ctemp23, ctemp24;
FLOAT ctemp25, ctemp26, ctemp27, ctemp28;
FLOAT ctemp29, ctemp30, ctemp31, ctemp32;
IFLOAT *aoffset;
IFLOAT *aoffset1, *aoffset2;
IFLOAT *boffset;
IFLOAT ctemp01, ctemp02, ctemp03, ctemp04;
IFLOAT ctemp05, ctemp06, ctemp07, ctemp08;
IFLOAT ctemp09, ctemp10, ctemp11, ctemp12;
IFLOAT ctemp13, ctemp14, ctemp15, ctemp16;
IFLOAT ctemp17, ctemp18, ctemp19, ctemp20;
IFLOAT ctemp21, ctemp22, ctemp23, ctemp24;
IFLOAT ctemp25, ctemp26, ctemp27, ctemp28;
IFLOAT ctemp29, ctemp30, ctemp31, ctemp32;

aoffset = a;
boffset = b;


+ 23
- 23
kernel/generic/gemm_tcopy_8.c View File

@@ -39,32 +39,32 @@
#include <stdio.h>
#include "common.h"

int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){

BLASLONG i, j;

FLOAT *aoffset;
FLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4;
FLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8;
FLOAT *boffset, *boffset1, *boffset2, *boffset3, *boffset4;
FLOAT ctemp01, ctemp02, ctemp03, ctemp04;
FLOAT ctemp05, ctemp06, ctemp07, ctemp08;
FLOAT ctemp09, ctemp10, ctemp11, ctemp12;
FLOAT ctemp13, ctemp14, ctemp15, ctemp16;
FLOAT ctemp17, ctemp18, ctemp19, ctemp20;
FLOAT ctemp21, ctemp22, ctemp23, ctemp24;
FLOAT ctemp25, ctemp26, ctemp27, ctemp28;
FLOAT ctemp29, ctemp30, ctemp31, ctemp32;
FLOAT ctemp33, ctemp34, ctemp35, ctemp36;
FLOAT ctemp37, ctemp38, ctemp39, ctemp40;
FLOAT ctemp41, ctemp42, ctemp43, ctemp44;
FLOAT ctemp45, ctemp46, ctemp47, ctemp48;
FLOAT ctemp49, ctemp50, ctemp51, ctemp52;
FLOAT ctemp53, ctemp54, ctemp55, ctemp56;
FLOAT ctemp57, ctemp58, ctemp59, ctemp60;
FLOAT ctemp61, ctemp62, ctemp63, ctemp64;
IFLOAT *aoffset;
IFLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4;
IFLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8;
IFLOAT *boffset, *boffset1, *boffset2, *boffset3, *boffset4;
IFLOAT ctemp01, ctemp02, ctemp03, ctemp04;
IFLOAT ctemp05, ctemp06, ctemp07, ctemp08;
IFLOAT ctemp09, ctemp10, ctemp11, ctemp12;
IFLOAT ctemp13, ctemp14, ctemp15, ctemp16;
IFLOAT ctemp17, ctemp18, ctemp19, ctemp20;
IFLOAT ctemp21, ctemp22, ctemp23, ctemp24;
IFLOAT ctemp25, ctemp26, ctemp27, ctemp28;
IFLOAT ctemp29, ctemp30, ctemp31, ctemp32;
IFLOAT ctemp33, ctemp34, ctemp35, ctemp36;
IFLOAT ctemp37, ctemp38, ctemp39, ctemp40;
IFLOAT ctemp41, ctemp42, ctemp43, ctemp44;
IFLOAT ctemp45, ctemp46, ctemp47, ctemp48;
IFLOAT ctemp49, ctemp50, ctemp51, ctemp52;
IFLOAT ctemp53, ctemp54, ctemp55, ctemp56;
IFLOAT ctemp57, ctemp58, ctemp59, ctemp60;
IFLOAT ctemp61, ctemp62, ctemp63, ctemp64;

aoffset = a;
boffset = b;


+ 11
- 0
kernel/power/KERNEL.POWER10 View File

@@ -7,6 +7,17 @@ else
#CGEMM_BETA = ../generic/zgemm_beta.c
#ZGEMM_BETA = ../generic/zgemm_beta.c

SHGEMM_BETA = ../generic/gemm_beta.c
SHGEMMKERNEL = shgemm_kernel_power10.c
SHGEMMINCOPY = ../generic/gemm_ncopy_16.c
SHGEMMITCOPY = ../generic/gemm_tcopy_16.c
SHGEMMONCOPY = ../generic/gemm_ncopy_8.c
SHGEMMOTCOPY = ../generic/gemm_tcopy_8.c
SHGEMMINCOPYOBJ = shgemm_incopy$(TSUFFIX).$(SUFFIX)
SHGEMMITCOPYOBJ = shgemm_itcopy$(TSUFFIX).$(SUFFIX)
SHGEMMONCOPYOBJ = shgemm_oncopy$(TSUFFIX).$(SUFFIX)
SHGEMMOTCOPYOBJ = shgemm_otcopy$(TSUFFIX).$(SUFFIX)

STRMMKERNEL = sgemm_kernel_power10.c
DTRMMKERNEL = dgemm_kernel_power10.c
CTRMMKERNEL = cgemm_kernel_power10.S


+ 1044
- 0
kernel/power/shgemm_kernel_power10.c
File diff suppressed because it is too large
View File


+ 13
- 0
param.h View File

@@ -2297,6 +2297,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#endif

#if defined(POWER10)
#undef SHGEMM_DEFAULT_UNROLL_N
#undef SHGEMM_DEFAULT_UNROLL_M
#undef SHGEMM_DEFAULT_P
#undef SHGEMM_DEFAULT_R
#undef SHGEMM_DEFAULT_Q
#define SHGEMM_DEFAULT_UNROLL_M 16
#define SHGEMM_DEFAULT_UNROLL_N 8
#define SHGEMM_DEFAULT_P 832
#define SHGEMM_DEFAULT_Q 1026
#define SHGEMM_DEFAULT_R 4096
#endif

#if defined(SPARC) && defined(V7)

#define SNUMOPT 4


Loading…
Cancel
Save