Browse Source

Simplify gemv_t_sve_v1x3 kernel

tags/v0.3.30
Annop Wongwathanarat 1 year ago
parent
commit
c0318cea6e
2 changed files with 58 additions and 39 deletions
  1. +3
    -0
      CONTRIBUTORS.md
  2. +55
    -39
      kernel/arm64/gemv_t_sve_v1x3.c

+ 3
- 0
CONTRIBUTORS.md View File

@@ -232,3 +232,6 @@ In chronological order:


* Aniket P. Garade <https://github.com/garadeaniket> Sushil Pratap Singh <https://github.com/SushilPratap04> Juliya James <https://github.com/Juliya32> * Aniket P. Garade <https://github.com/garadeaniket> Sushil Pratap Singh <https://github.com/SushilPratap04> Juliya James <https://github.com/Juliya32>
* [2024-12-13] Optimized swap and rot Level-1 BLAS routines with ARM SVE * [2024-12-13] Optimized swap and rot Level-1 BLAS routines with ARM SVE

* Annop Wongwathanarat <annop.wongwathanarat@arm.com>
* [2025-01-21] Optimize gemv_t_sve_v1x3 kernel

+ 55
- 39
kernel/arm64/gemv_t_sve_v1x3.c View File

@@ -1,5 +1,5 @@
/*************************************************************************** /***************************************************************************
Copyright (c) 2024, The OpenBLAS Project
Copyright (c) 2024, 2025 The OpenBLAS Project
All rights reserved. All rights reserved.


Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
@@ -56,12 +56,16 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a,
BLASLONG ix,iy; BLASLONG ix,iy;
BLASLONG j; BLASLONG j;
FLOAT *a_ptr; FLOAT *a_ptr;
FLOAT *y_ptr;
FLOAT temp; FLOAT temp;


iy = 0; iy = 0;


if (inc_x == 1) { if (inc_x == 1) {
BLASLONG width = (n + 3 - 1) / 3;
BLASLONG width = n / 3;
BLASLONG sve_size = SV_COUNT();
svbool_t pg_true = SV_TRUE();
svbool_t pg = SV_WHILE(0, m % sve_size);


FLOAT *a0_ptr = a + lda * width * 0; FLOAT *a0_ptr = a + lda * width * 0;
FLOAT *a1_ptr = a + lda * width * 1; FLOAT *a1_ptr = a + lda * width * 1;
@@ -72,60 +76,41 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a,
FLOAT *y2_ptr = y + inc_y * width * 2; FLOAT *y2_ptr = y + inc_y * width * 2;


for (j = 0; j < width; j++) { for (j = 0; j < width; j++) {
svbool_t pg00 = ((j + width * 0) < n) ? SV_TRUE() : svpfalse();
svbool_t pg01 = ((j + width * 1) < n) ? SV_TRUE() : svpfalse();
svbool_t pg02 = ((j + width * 2) < n) ? SV_TRUE() : svpfalse();

SV_TYPE temp00_vec = SV_DUP(0.0); SV_TYPE temp00_vec = SV_DUP(0.0);
SV_TYPE temp01_vec = SV_DUP(0.0); SV_TYPE temp01_vec = SV_DUP(0.0);
SV_TYPE temp02_vec = SV_DUP(0.0); SV_TYPE temp02_vec = SV_DUP(0.0);


i = 0; i = 0;
BLASLONG sve_size = SV_COUNT();
while ((i + sve_size * 1 - 1) < m) { while ((i + sve_size * 1 - 1) < m) {
SV_TYPE x0_vec = svld1_vnum(SV_TRUE(), x + i, 0);
SV_TYPE x0_vec = svld1(pg_true, x + i);


SV_TYPE a00_vec = svld1_vnum(pg00, a0_ptr + i, 0);
SV_TYPE a01_vec = svld1_vnum(pg01, a1_ptr + i, 0);
SV_TYPE a02_vec = svld1_vnum(pg02, a2_ptr + i, 0);
SV_TYPE a00_vec = svld1(pg_true, a0_ptr + i);
SV_TYPE a01_vec = svld1(pg_true, a1_ptr + i);
SV_TYPE a02_vec = svld1(pg_true, a2_ptr + i);


temp00_vec = svmla_m(pg00, temp00_vec, a00_vec, x0_vec);
temp01_vec = svmla_m(pg01, temp01_vec, a01_vec, x0_vec);
temp02_vec = svmla_m(pg02, temp02_vec, a02_vec, x0_vec);
temp00_vec = svmla_x(pg_true, temp00_vec, a00_vec, x0_vec);
temp01_vec = svmla_x(pg_true, temp01_vec, a01_vec, x0_vec);
temp02_vec = svmla_x(pg_true, temp02_vec, a02_vec, x0_vec);


i += sve_size * 1; i += sve_size * 1;
} }


if (i < m) { if (i < m) {
svbool_t pg0 = SV_WHILE(i + sve_size * 0, m);

pg00 = svand_z(SV_TRUE(), pg0, pg00);
pg01 = svand_z(SV_TRUE(), pg0, pg01);
pg02 = svand_z(SV_TRUE(), pg0, pg02);
SV_TYPE x0_vec = svld1(pg, x + i);


SV_TYPE x0_vec = svld1_vnum(pg0, x + i, 0);
SV_TYPE a00_vec = svld1(pg, a0_ptr + i);
SV_TYPE a01_vec = svld1(pg, a1_ptr + i);
SV_TYPE a02_vec = svld1(pg, a2_ptr + i);


SV_TYPE a00_vec = svld1_vnum(pg00, a0_ptr + i, 0);
SV_TYPE a01_vec = svld1_vnum(pg01, a1_ptr + i, 0);
SV_TYPE a02_vec = svld1_vnum(pg02, a2_ptr + i, 0);

temp00_vec = svmla_m(pg00, temp00_vec, a00_vec, x0_vec);
temp01_vec = svmla_m(pg01, temp01_vec, a01_vec, x0_vec);
temp02_vec = svmla_m(pg02, temp02_vec, a02_vec, x0_vec);
temp00_vec = svmla_m(pg, temp00_vec, a00_vec, x0_vec);
temp01_vec = svmla_m(pg, temp01_vec, a01_vec, x0_vec);
temp02_vec = svmla_m(pg, temp02_vec, a02_vec, x0_vec);
} }


if ((j + width * 0) < n) {
temp = svaddv(SV_TRUE(), temp00_vec);
y0_ptr[iy] += alpha * temp;
}
if ((j + width * 1) < n) {
temp = svaddv(SV_TRUE(), temp01_vec);
y1_ptr[iy] += alpha * temp;
}
if ((j + width * 2) < n) {
temp = svaddv(SV_TRUE(), temp02_vec);
y2_ptr[iy] += alpha * temp;
}
y0_ptr[iy] += alpha * svaddv(pg_true, temp00_vec);
y1_ptr[iy] += alpha * svaddv(pg_true, temp01_vec);
y2_ptr[iy] += alpha * svaddv(pg_true, temp02_vec);

iy += inc_y; iy += inc_y;


a0_ptr += lda; a0_ptr += lda;
@@ -133,6 +118,37 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a,
a2_ptr += lda; a2_ptr += lda;
} }


a_ptr = a2_ptr;
y_ptr = y2_ptr;
for (j = width * 3; j < n; j++) {
SV_TYPE temp_vec = SV_DUP(0.0);

i = 0;
while ((i + sve_size * 1 - 1) < m) {
SV_TYPE x_vec = svld1(pg_true, x + i);

SV_TYPE a_vec = svld1(pg_true, a_ptr + i);

temp_vec = svmla_x(pg_true, temp_vec, a_vec, x_vec);

i += sve_size * 1;
}

if (i < m) {
SV_TYPE x_vec = svld1(pg, x + i);

SV_TYPE a_vec = svld1(pg, a_ptr + i);

temp_vec = svmla_m(pg, temp_vec, a_vec, x_vec);
}

y_ptr[iy] += alpha * svaddv(pg_true, temp_vec);

iy += inc_y;

a_ptr += lda;
}

return(0); return(0);
} }




Loading…
Cancel
Save