Browse Source

Split the microkernel workload into chunks of 32 floats for dsdot mode to limit loss of precision

tags/v0.3.0
Martin Kroeker GitHub 8 years ago
parent
commit
5e3e91d0fc
1 changed file with 22 additions and 6 deletions
  1. +22
    -6
      kernel/x86_64/sdot.c

+ 22
- 6
kernel/x86_64/sdot.c View File

@@ -78,7 +78,12 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
BLASLONG ix=0,iy=0;
double dot = 0.0 ;

#if defined (DSDOT)
double mydot = 0.0;
FLOAT asmdot = 0.0;
#else
FLOAT mydot=0.0;
#endif
BLASLONG n1;

if ( n <= 0 ) return(dot);
@@ -89,9 +94,23 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
n1 = n & (BLASLONG)(-32);

if ( n1 )
#if defined(DSDOT)
{
FLOAT *x1=x;
FLOAT *y1=y;
BLASLONG n2 = 32;
while (i<n1) {
sdot_kernel_16(n2, x1, y1 , &asmdot );
mydot += (double)asmdot;
asmdot=0.;
x1+=32;
y1+=32;
i+=32;
}
}
#else
sdot_kernel_16(n1, x, y , &mydot );


#endif
i = n1;
while(i < n)
{
@@ -103,11 +122,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
i++ ;

}
#if defined(DSDOT)
dot+=(double)mydot;
#else

dot+=mydot;
#endif
return(dot);




Loading…
Cancel
Save