SVE implementation of gemv , scal , swap and rot BLAS routines files has been addedpull/4940/head
| @@ -0,0 +1,41 @@ | |||
| #include "common.h" | |||
| #include <arm_sve.h> | |||
| #ifdef DOUBLE | |||
| #define SVE_TYPE svfloat64_t | |||
| #define SVE_ZERO svdup_f64(0.0) | |||
| #define SVE_WHILELT svwhilelt_b64 | |||
| #define SVE_ALL svptrue_b64() | |||
| #define SVE_WIDTH svcntd() | |||
| #else | |||
| #define SVE_TYPE svfloat32_t | |||
| #define SVE_ZERO svdup_f32(0.0) | |||
| #define SVE_WHILELT svwhilelt_b32 | |||
| #define SVE_ALL svptrue_b32() | |||
| #define SVE_WIDTH svcntw() | |||
| #endif | |||
| static FLOAT dgemv_kernel_sve(BLASLONG i, FLOAT *x, BLASLONG lda, FLOAT *y, BLASLONG incx, BLASLONG n){ | |||
| SVE_TYPE acc_a = SVE_ZERO; | |||
| SVE_TYPE acc_b = SVE_ZERO; | |||
| BLASLONG sve_width = SVE_WIDTH; | |||
| for (BLASLONG j = 0; j < n; j += sve_width * 2) { | |||
| svbool_t pg_a = SVE_WHILELT(j, n); | |||
| svbool_t pg_b = SVE_WHILELT(j + sve_width, n); | |||
| SVE_TYPE x_vec_a = svld1(pg_a, &x[i*lda+j]); | |||
| SVE_TYPE y_vec_a = svld1(pg_a, &y[j*incx]); | |||
| SVE_TYPE x_vec_b = svld1(pg_b, &x[i*lda+j + sve_width]); | |||
| SVE_TYPE y_vec_b = svld1(pg_b, &y[j*incx + sve_width]); | |||
| acc_a = svmla_m(pg_a, acc_a, x_vec_a, y_vec_a); | |||
| acc_b = svmla_m(pg_b, acc_b, x_vec_b, y_vec_b); | |||
| } | |||
| return svaddv(SVE_ALL, acc_a) + svaddv(SVE_ALL, acc_b); | |||
| } | |||
| @@ -0,0 +1,32 @@ | |||
| #include "common.h" | |||
| // Some compilers will report feature support for SVE without the appropriate | |||
| // header available | |||
| #ifdef HAVE_SVE | |||
| #if defined __has_include | |||
| #if __has_include(<arm_sve.h>) && __ARM_FEATURE_SVE | |||
| #define USE_SVE | |||
| #endif | |||
| #endif | |||
| #endif | |||
| #include "dgemv_kernel_sve.c" | |||
| #include "dgemv_kernel_c.c" | |||
| int CNAME(BLASLONG m, BLASLONG n , BLASLONG dummy, FLOAT alpha, FLOAT* a, BLASLONG lda , FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer){ | |||
| if ( incx == 1 && incy == 1){ | |||
| // if(alpha!=1) for(BLASLONG i=0; i<n; ++i)X[i]=alpha*X[i]; | |||
| for(BLASLONG i=0; i<n; ++i){ | |||
| // Y[i*incy]+= dgemv_kernel_sve(i,A,lda,X,incx,n); | |||
| y[i]+= dgemv_kernel_sve(i,lda,a,m,x,alpha,n); | |||
| } | |||
| } | |||
| // BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | |||
| else dgemv_kernel_c( m, n, dummy, alpha, a, lda, x, incx, y, incy, buffer ); | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,19 @@ | |||
| #include "common.h" | |||
| #include "rot_kernel_sve.c" | |||
| #include "rot_kernel_c.c" | |||
| int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) | |||
| { | |||
| if ( n <= 0 ) return(0); | |||
| if ( inc_x == 1 && inc_y==1) | |||
| rot_kernel_sve( n, x, y, c, s); | |||
| else | |||
| rot_kernel_c ( n, x, inc_x, y, inc_y, c, s); | |||
| return(0); | |||
| } | |||
| @@ -0,0 +1,25 @@ | |||
| #include "common.h" | |||
| static int rot_kernel_c(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) | |||
| { | |||
| BLASLONG i=0; | |||
| BLASLONG ix=0,iy=0; | |||
| FLOAT temp; | |||
| if ( n <= 0 ) return(0); | |||
| while(i < n) | |||
| { | |||
| temp = c*x[ix] + s*y[iy] ; | |||
| y[iy] = c*y[iy] - s*x[ix] ; | |||
| x[ix] = temp ; | |||
| ix += inc_x ; | |||
| iy += inc_y ; | |||
| i++ ; | |||
| } | |||
| return(0); | |||
| } | |||
| @@ -0,0 +1,38 @@ | |||
| #include "common.h" | |||
| #include <arm_sve.h> | |||
| #ifdef DOUBLE | |||
| #define SVE_TYPE svfloat64_t | |||
| #define SVE_ZERO svdup_f64(0.0) | |||
| #define SVE_WHILELT svwhilelt_b64 | |||
| #define SVE_ALL svptrue_b64() | |||
| #define SVE_WIDTH svcntd() | |||
| #else | |||
| #define SVE_TYPE svfloat32_t | |||
| #define SVE_ZERO svdup_f32(0.0) | |||
| #define SVE_WHILELT svwhilelt_b32 | |||
| #define SVE_ALL svptrue_b32() | |||
| #define SVE_WIDTH svcntw() | |||
| #endif | |||
| static void rot_kernel_sve(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s){ | |||
| for(int i=0; i<n; i+=SVE_WIDTH){ | |||
| svbool_t pg =SVE_WHILELT((uint32_t)i,(uint32_t) n); | |||
| SVE_TYPE x_vec = svld1(pg, &x[i]); | |||
| SVE_TYPE y_vec = svld1(pg, &y[i]); | |||
| SVE_TYPE cx_vec=svmul_z(pg,x_vec,c); | |||
| SVE_TYPE sy_vec=svmul_z(pg,y_vec,s); | |||
| SVE_TYPE sx_vec=svmul_z(pg,x_vec,s); | |||
| SVE_TYPE cy_vec=svmul_z(pg,y_vec,c); | |||
| svst1(pg,&x[i],svadd_z(pg,cx_vec,sy_vec)); | |||
| svst1(pg,&y[i],svsub_z(pg,cy_vec,sx_vec)); | |||
| } | |||
| } | |||
| @@ -0,0 +1,20 @@ | |||
| #include "common.h" | |||
| #include "scal_kernel_sve.c" | |||
| #include "scal_kernel_c.c" | |||
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) | |||
| { | |||
| if ( (n <= 0) || (inc_x <= 0)) | |||
| return(0); | |||
| if (inc_x == 1) | |||
| scal_kernel_sve( n, x, da); | |||
| else | |||
| scal_kernel_c(n,dummy0,dummy1,da,x,inc_x,y,inc_y,dummy,dummy2); | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,23 @@ | |||
| #include "common.h" | |||
| static int scal_kernel_c(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) | |||
| { | |||
| BLASLONG i=0,j=0; | |||
| while(j < n) | |||
| { | |||
| if ( da == 0.0 ) | |||
| x[i]=0.0; | |||
| else | |||
| x[i] = da * x[i] ; | |||
| i += inc_x ; | |||
| j++; | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,28 @@ | |||
| #include "common.h" | |||
| #include <arm_sve.h> | |||
| #ifdef DOUBLE | |||
| #define SVE_TYPE svfloat64_t | |||
| #define SVE_ZERO svdup_f64(0.0) | |||
| #define SVE_WHILELT svwhilelt_b64 | |||
| #define SVE_ALL svptrue_b64() | |||
| #define SVE_WIDTH svcntd() | |||
| #else | |||
| #define SVE_TYPE svfloat32_t | |||
| #define SVE_ZERO svdup_f32(0.0) | |||
| #define SVE_WHILELT svwhilelt_b32 | |||
| #define SVE_ALL svptrue_b32() | |||
| #define SVE_WIDTH svcntw() | |||
| #endif | |||
| static int scal_kernel_sve(int n, FLOAT *x, FLOAT da) | |||
| { | |||
| for (int i = 0; i < n; i += SVE_WIDTH){ | |||
| svbool_t pg = SVE_WHILELT(i, n); | |||
| SVE_TYPE x_vec = svld1(pg, &x[i]); | |||
| SVE_TYPE result= svmul_z(pg,x_vec,da); | |||
| svst1(pg,&x[i],result); | |||
| } | |||
| return (0); | |||
| } | |||
| @@ -0,0 +1,22 @@ | |||
| #include "common.h" | |||
| #ifdef HAVE_SVE | |||
| #if defined __has_include | |||
| #if __has_include(<arm_sve.h>) && __ARM_FEATURE_SVE | |||
| #define USE_SVE | |||
| #endif | |||
| #endif | |||
| #endif | |||
| #include "swap_kernel_sve.c" | |||
| //(BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG) | |||
| //int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) | |||
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT* dummy, BLASLONG dummy2) | |||
| { | |||
| swap_kernel_sve(n, x,inc_x, y, inc_y); | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,37 @@ | |||
| #include "common.h" | |||
| #include <arm_sve.h> | |||
| #ifdef DOUBLE | |||
| #define SVE_TYPE svfloat64_t | |||
| #define SVE_ZERO svdup_f64(0.0) | |||
| #define SVE_WHILELT svwhilelt_b64 | |||
| #define SVE_ALL svptrue_b64() | |||
| #define SVE_WIDTH svcntd() | |||
| #else | |||
| #define SVE_TYPE svfloat32_t | |||
| #define SVE_ZERO svdup_f32(0.0) | |||
| #define SVE_WHILELT svwhilelt_b32 | |||
| #define SVE_ALL svptrue_b32() | |||
| #define SVE_WIDTH svcntw() | |||
| #endif | |||
| static int swap_kernel_sve(BLASLONG n, FLOAT *x,BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { | |||
| BLASLONG sve_width = SVE_WIDTH; | |||
| for (BLASLONG i = 0; i < n; i += sve_width * 2) { | |||
| svbool_t pg_a = SVE_WHILELT(i, n); | |||
| svbool_t pg_b = SVE_WHILELT((i + sve_width), n); | |||
| SVE_TYPE x_vec_a = svld1(pg_a, &x[i]); | |||
| SVE_TYPE y_vec_a = svld1(pg_a, &y[i]); | |||
| SVE_TYPE x_vec_b = svld1(pg_b, &x[i + sve_width]); | |||
| SVE_TYPE y_vec_b = svld1(pg_b, &y[i + sve_width]); | |||
| svst1(pg_a, &x[i], y_vec_a); | |||
| svst1(pg_a, &y[i], x_vec_a); | |||
| svst1(pg_b, &x[i+sve_width], y_vec_b); | |||
| svst1(pg_b, &y[i+sve_width], x_vec_b); | |||
| } | |||
| return 0; | |||
| } | |||