|
|
|
@@ -72,7 +72,7 @@ f32tobfloat16(float f32) |
|
|
|
#endif |
|
|
|
|
|
|
|
|
|
|
|
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta, |
|
|
|
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta_in, |
|
|
|
IFLOAT *dummy2, BLASLONG dummy3, IFLOAT *dummy4, BLASLONG dummy5, |
|
|
|
FLOAT *c, BLASLONG ldc){ |
|
|
|
|
|
|
|
@@ -83,6 +83,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta, |
|
|
|
c_offset = c; |
|
|
|
chunk = m >> 3; |
|
|
|
remain = m & 7; |
|
|
|
float beta = BF16TOF32(beta_in); |
|
|
|
|
|
|
|
if (beta == ZERO){ |
|
|
|
for(j=n; j>0; j--){ |
|
|
|
c_offset1 = c_offset; |
|
|
|
@@ -108,18 +110,18 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta, |
|
|
|
c_offset1 = c_offset; |
|
|
|
c_offset += ldc; |
|
|
|
for(i=chunk; i>0; i--){ |
|
|
|
*(c_offset1 + 0) *= beta; |
|
|
|
*(c_offset1 + 1) *= beta; |
|
|
|
*(c_offset1 + 2) *= beta; |
|
|
|
*(c_offset1 + 3) *= beta; |
|
|
|
*(c_offset1 + 4) *= beta; |
|
|
|
*(c_offset1 + 5) *= beta; |
|
|
|
*(c_offset1 + 6) *= beta; |
|
|
|
*(c_offset1 + 7) *= beta; |
|
|
|
*(c_offset1 + 0) = F32TOBF16(beta * BF16TOF32(c_offset1[0])); |
|
|
|
*(c_offset1 + 1) = F32TOBF16(beta * BF16TOF32(c_offset1[1])); |
|
|
|
*(c_offset1 + 2) = F32TOBF16(beta * BF16TOF32(c_offset1[2])); |
|
|
|
*(c_offset1 + 3) = F32TOBF16(beta * BF16TOF32(c_offset1[3])); |
|
|
|
*(c_offset1 + 4) = F32TOBF16(beta * BF16TOF32(c_offset1[4])); |
|
|
|
*(c_offset1 + 5) = F32TOBF16(beta * BF16TOF32(c_offset1[5])); |
|
|
|
*(c_offset1 + 6) = F32TOBF16(beta * BF16TOF32(c_offset1[6])); |
|
|
|
*(c_offset1 + 7) = F32TOBF16(beta * BF16TOF32(c_offset1[7])); |
|
|
|
c_offset1 += 8; |
|
|
|
} |
|
|
|
for(i=remain; i>0; i--){ |
|
|
|
*c_offset1 *= beta; |
|
|
|
*c_offset1 = F32TOBF16(beta * BF16TOF32(c_offset1[0])); |
|
|
|
c_offset1 ++; |
|
|
|
} |
|
|
|
} |
|
|
|
|