Browse Source

fixed the performance problem in RISCV64_ZVL256 when OPENBLAS_K is small

tags/v0.3.30
guoyuanplct 11 months ago
parent
commit
2ae019161a
2 changed files with 100 additions and 1 deletions
  1. +47
    -0
      kernel/riscv64/zaxpy_vector.c
  2. +53
    -1
      kernel/riscv64/zdot_vector.c

+ 47
- 0
kernel/riscv64/zaxpy_vector.c View File

@@ -43,8 +43,55 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define VFNMSACVF_FLOAT RISCV_RVV(vfnmsac_vf_f64m4)
#endif

/*
 * Scalar complex AXPY used by CNAME for short vectors:
 *
 *     y[k] += alpha * x[k]          (CONJ not defined)
 *     y[k] += alpha * conj(x[k])    (CONJ defined)
 *
 * with alpha = da_r + i*da_i.  x and y hold interleaved (real, imag) pairs,
 * so the element strides inc_x / inc_y are doubled to step over FLOAT slots.
 *
 * n            number of complex elements; n <= 0 is a no-op
 * da_r, da_i   real and imaginary parts of alpha; if both are zero y is
 *              left untouched
 * x, inc_x     input vector and its stride in complex elements
 * y, inc_y     vector updated in place and its stride in complex elements
 * dummy0, dummy1, dummy, dummy2
 *              unused; present only so the signature mirrors CNAME
 *
 * Always returns 0.
 *
 * The helper is `static inline`: a plain `inline` definition provides no
 * external definition in C99/C11, so a call that the compiler chooses not to
 * inline (for example at -O0) would otherwise be an undefined reference at
 * link time.
 */
#if !defined(DOUBLE)
static inline int small_caxpy_kernel(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
#else
static inline int small_zaxpy_kernel(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
#endif
{
	BLASLONG i;
	BLASLONG ix = 0;
	BLASLONG iy = 0;
	BLASLONG inc_x2;
	BLASLONG inc_y2;

	if (n <= 0) return 0;

	/* alpha == 0 contributes nothing; skip the loop entirely. */
	if (da_r == 0.0 && da_i == 0.0) return 0;

	/* Each complex element occupies two consecutive FLOATs. */
	inc_x2 = 2 * inc_x;
	inc_y2 = 2 * inc_y;

	for (i = 0; i < n; i++)
	{
#if !defined(CONJ)
		y[iy]   += ( da_r * x[ix]   - da_i * x[ix+1] ) ;
		y[iy+1] += ( da_r * x[ix+1] + da_i * x[ix]   ) ;
#else
		/* conj(x) flips the sign of the imaginary part of x. */
		y[iy]   += ( da_r * x[ix]   + da_i * x[ix+1] ) ;
		y[iy+1] -= ( da_r * x[ix+1] - da_i * x[ix]   ) ;
#endif
		ix += inc_x2;
		iy += inc_y2;
	}

	return 0;
}

int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
#if !defined(DOUBLE)
if(n < 16) {
return small_caxpy_kernel(n, dummy0, dummy1, da_r, da_i, x, inc_x, y, inc_y, dummy, dummy2);
}
#else
if(n < 8) {
return small_zaxpy_kernel(n, dummy0, dummy1, da_r, da_i, x, inc_x, y, inc_y, dummy, dummy2);
}
#endif
BLASLONG i = 0, j = 0;
BLASLONG ix = 0,iy = 0;
if(n <= 0) return(0);


+ 53
- 1
kernel/riscv64/zdot_vector.c View File

@@ -68,8 +68,60 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define VFNMSACVV_FLOAT RISCV_RVV(vfnmsac_vv_f64m4)
#endif

/*
 * Scalar complex dot product used by CNAME for short vectors:
 *
 *     sum over k of x[k] * y[k]          (CONJ not defined)
 *     sum over k of conj(x[k]) * y[k]    (CONJ defined)
 *
 * x and y hold interleaved (real, imag) pairs, so the element strides
 * inc_x / inc_y are doubled to step over FLOAT slots.
 *
 * n          number of complex elements; for n < 1 the result is 0 + 0i
 * x, inc_x   first vector and its stride in complex elements
 * y, inc_y   second vector and its stride in complex elements
 *
 * Returns the accumulated sum as an OPENBLAS_COMPLEX_FLOAT.
 *
 * The helper is `static inline`: a plain `inline` definition provides no
 * external definition in C99/C11, so a call that the compiler chooses not to
 * inline (for example at -O0) would otherwise be an undefined reference at
 * link time.
 */
#if !defined(DOUBLE)
static inline OPENBLAS_COMPLEX_FLOAT small_cdot_kernel(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
#else
static inline OPENBLAS_COMPLEX_FLOAT small_zdot_kernel(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
#endif
{
	BLASLONG i;
	BLASLONG ix = 0;
	BLASLONG iy = 0;
	BLASLONG inc_x2;
	BLASLONG inc_y2;
	FLOAT dot[2];
	OPENBLAS_COMPLEX_FLOAT result;

	/* dot[0] accumulates the real part, dot[1] the imaginary part. */
	dot[0] = 0.0;
	dot[1] = 0.0;

	CREAL(result) = 0.0 ;
	CIMAG(result) = 0.0 ;

	if (n < 1) return result;

	/* Each complex element occupies two consecutive FLOATs. */
	inc_x2 = 2 * inc_x;
	inc_y2 = 2 * inc_y;

	for (i = 0; i < n; i++)
	{
#if !defined(CONJ)
		dot[0] += ( x[ix]   * y[iy] - x[ix+1] * y[iy+1] ) ;
		dot[1] += ( x[ix+1] * y[iy] + x[ix]   * y[iy+1] ) ;
#else
		/* conj(x) flips the sign of the imaginary part of x. */
		dot[0] += ( x[ix]   * y[iy] + x[ix+1] * y[iy+1] ) ;
		dot[1] -= ( x[ix+1] * y[iy] - x[ix]   * y[iy+1] ) ;
#endif
		ix += inc_x2;
		iy += inc_y2;
	}

	CREAL(result) = dot[0];
	CIMAG(result) = dot[1];
	return result;
}
OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
#if !defined(DOUBLE)
if(n < 16) {
return small_cdot_kernel(n, x, inc_x, y, inc_y);
}
#else
if(n < 8) {
return small_zdot_kernel(n, x, inc_x, y, inc_y);
}
#endif
BLASLONG i=0, j=0;
BLASLONG ix=0,iy=0;
FLOAT dot[2];
@@ -148,4 +200,4 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA
CREAL(result) = dot[0];
CIMAG(result) = dot[1];
return(result);
}
}

Loading…
Cancel
Save