Browse Source

Simplify gemv_t_sve_v1x3 kernel

tags/v0.3.30
Annop Wongwathanarat 1 year ago
parent
commit
c0318cea6e
2 changed files with 58 additions and 39 deletions
  1. +3
    -0
      CONTRIBUTORS.md
  2. +55
    -39
      kernel/arm64/gemv_t_sve_v1x3.c

+ 3
- 0
CONTRIBUTORS.md View File

@@ -232,3 +232,6 @@ In chronological order:

* Aniket P. Garade <https://github.com/garadeaniket> Sushil Pratap Singh <https://github.com/SushilPratap04> Juliya James <https://github.com/Juliya32>
* [2024-12-13] Optimized swap and rot Level-1 BLAS routines with ARM SVE

* Annop Wongwathanarat <annop.wongwathanarat@arm.com>
* [2025-01-21] Optimize gemv_t_sve_v1x3 kernel

+ 55
- 39
kernel/arm64/gemv_t_sve_v1x3.c View File

@@ -1,5 +1,5 @@
/***************************************************************************
Copyright (c) 2024, The OpenBLAS Project
Copyright (c) 2024, 2025 The OpenBLAS Project
All rights reserved.

Redistribution and use in source and binary forms, with or without
@@ -56,12 +56,16 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a,
BLASLONG ix,iy;
BLASLONG j;
FLOAT *a_ptr;
FLOAT *y_ptr;
FLOAT temp;

iy = 0;

if (inc_x == 1) {
BLASLONG width = (n + 3 - 1) / 3;
BLASLONG width = n / 3;
BLASLONG sve_size = SV_COUNT();
svbool_t pg_true = SV_TRUE();
svbool_t pg = SV_WHILE(0, m % sve_size);

FLOAT *a0_ptr = a + lda * width * 0;
FLOAT *a1_ptr = a + lda * width * 1;
@@ -72,60 +76,41 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a,
FLOAT *y2_ptr = y + inc_y * width * 2;

for (j = 0; j < width; j++) {
svbool_t pg00 = ((j + width * 0) < n) ? SV_TRUE() : svpfalse();
svbool_t pg01 = ((j + width * 1) < n) ? SV_TRUE() : svpfalse();
svbool_t pg02 = ((j + width * 2) < n) ? SV_TRUE() : svpfalse();

SV_TYPE temp00_vec = SV_DUP(0.0);
SV_TYPE temp01_vec = SV_DUP(0.0);
SV_TYPE temp02_vec = SV_DUP(0.0);

i = 0;
BLASLONG sve_size = SV_COUNT();
while ((i + sve_size * 1 - 1) < m) {
SV_TYPE x0_vec = svld1_vnum(SV_TRUE(), x + i, 0);
SV_TYPE x0_vec = svld1(pg_true, x + i);

SV_TYPE a00_vec = svld1_vnum(pg00, a0_ptr + i, 0);
SV_TYPE a01_vec = svld1_vnum(pg01, a1_ptr + i, 0);
SV_TYPE a02_vec = svld1_vnum(pg02, a2_ptr + i, 0);
SV_TYPE a00_vec = svld1(pg_true, a0_ptr + i);
SV_TYPE a01_vec = svld1(pg_true, a1_ptr + i);
SV_TYPE a02_vec = svld1(pg_true, a2_ptr + i);

temp00_vec = svmla_m(pg00, temp00_vec, a00_vec, x0_vec);
temp01_vec = svmla_m(pg01, temp01_vec, a01_vec, x0_vec);
temp02_vec = svmla_m(pg02, temp02_vec, a02_vec, x0_vec);
temp00_vec = svmla_x(pg_true, temp00_vec, a00_vec, x0_vec);
temp01_vec = svmla_x(pg_true, temp01_vec, a01_vec, x0_vec);
temp02_vec = svmla_x(pg_true, temp02_vec, a02_vec, x0_vec);

i += sve_size * 1;
}

if (i < m) {
svbool_t pg0 = SV_WHILE(i + sve_size * 0, m);

pg00 = svand_z(SV_TRUE(), pg0, pg00);
pg01 = svand_z(SV_TRUE(), pg0, pg01);
pg02 = svand_z(SV_TRUE(), pg0, pg02);
SV_TYPE x0_vec = svld1(pg, x + i);

SV_TYPE x0_vec = svld1_vnum(pg0, x + i, 0);
SV_TYPE a00_vec = svld1(pg, a0_ptr + i);
SV_TYPE a01_vec = svld1(pg, a1_ptr + i);
SV_TYPE a02_vec = svld1(pg, a2_ptr + i);

SV_TYPE a00_vec = svld1_vnum(pg00, a0_ptr + i, 0);
SV_TYPE a01_vec = svld1_vnum(pg01, a1_ptr + i, 0);
SV_TYPE a02_vec = svld1_vnum(pg02, a2_ptr + i, 0);

temp00_vec = svmla_m(pg00, temp00_vec, a00_vec, x0_vec);
temp01_vec = svmla_m(pg01, temp01_vec, a01_vec, x0_vec);
temp02_vec = svmla_m(pg02, temp02_vec, a02_vec, x0_vec);
temp00_vec = svmla_m(pg, temp00_vec, a00_vec, x0_vec);
temp01_vec = svmla_m(pg, temp01_vec, a01_vec, x0_vec);
temp02_vec = svmla_m(pg, temp02_vec, a02_vec, x0_vec);
}

if ((j + width * 0) < n) {
temp = svaddv(SV_TRUE(), temp00_vec);
y0_ptr[iy] += alpha * temp;
}
if ((j + width * 1) < n) {
temp = svaddv(SV_TRUE(), temp01_vec);
y1_ptr[iy] += alpha * temp;
}
if ((j + width * 2) < n) {
temp = svaddv(SV_TRUE(), temp02_vec);
y2_ptr[iy] += alpha * temp;
}
y0_ptr[iy] += alpha * svaddv(pg_true, temp00_vec);
y1_ptr[iy] += alpha * svaddv(pg_true, temp01_vec);
y2_ptr[iy] += alpha * svaddv(pg_true, temp02_vec);

iy += inc_y;

a0_ptr += lda;
@@ -133,6 +118,37 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a,
a2_ptr += lda;
}

a_ptr = a2_ptr;
y_ptr = y2_ptr;
for (j = width * 3; j < n; j++) {
SV_TYPE temp_vec = SV_DUP(0.0);

i = 0;
while ((i + sve_size * 1 - 1) < m) {
SV_TYPE x_vec = svld1(pg_true, x + i);

SV_TYPE a_vec = svld1(pg_true, a_ptr + i);

temp_vec = svmla_x(pg_true, temp_vec, a_vec, x_vec);

i += sve_size * 1;
}

if (i < m) {
SV_TYPE x_vec = svld1(pg, x + i);

SV_TYPE a_vec = svld1(pg, a_ptr + i);

temp_vec = svmla_m(pg, temp_vec, a_vec, x_vec);
}

y_ptr[iy] += alpha * svaddv(pg_true, temp_vec);

iy += inc_y;

a_ptr += lda;
}

return(0);
}



Loading…
Cancel
Save