|
|
|
@@ -36,20 +36,29 @@ extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n |
|
|
|
void *c, BLASLONG ldc, int (*function)(), int nthreads); |
|
|
|
#endif |
|
|
|
|
|
|
|
#define N "x0" /* vector length */ |
|
|
|
#define X "x1" /* X vector address */ |
|
|
|
#define INC_X "x2" /* X stride */ |
|
|
|
#define I "x5" /* loop variable */ |
|
|
|
#if !defined(COMPLEX) |
|
|
|
#define N "x0" /* vector length */ |
|
|
|
#define X "x1" /* X vector address */ |
|
|
|
#define INC_X "x2" /* X stride */ |
|
|
|
#define I "x5" /* loop variable */ |
|
|
|
|
|
|
|
#define TMPF "s16" |
|
|
|
#define TMPFD "d17" |
|
|
|
#define SSQD "d0" |
|
|
|
#define TMPF "s16" |
|
|
|
#define TMPFD "d17" |
|
|
|
#define SSQD "d0" |
|
|
|
|
|
|
|
/* Loop-shape constants for this branch, pasted as immediates into the asm:
 * N_DIV_SHIFT feeds the "asr" that derives the unrolled-loop trip count
 * from N, N_REM_MASK feeds the "ands" that extracts the leftover element
 * count, and INC_SHIFT feeds the "lsl" that scales INC_X.
 * 6/63 means N is split as N>>6 iterations plus N&63 leftovers; a shift of
 * 2 multiplies the stride by 4 (presumably sizeof(float) -- confirm). */
#define N_DIV_SHIFT "6" |
|
|
|
#define N_REM_MASK "63" |
|
|
|
#define INC_SHIFT "2" |
|
|
|
|
|
|
|
/* Single-element step for unit stride: load one 32-bit value from X with a
 * 4-byte post-increment, widen it to double precision in TMPFD, and
 * fused-multiply-add its square into the SSQD accumulator. */
#define KERNEL_F1 \ |
|
|
|
"ldr "TMPF", ["X"], #4 \n" \ |
|
|
|
"fcvt "TMPFD", "TMPF" \n" \ |
|
|
|
"fmadd "SSQD", "TMPFD", "TMPFD", "SSQD"\n" |
|
|
|
|
|
|
|
/* Body of one unrolled-loop iteration in this branch: two back-to-back
 * expansions of KERNEL_F32. */
#define KERNEL_F \ |
|
|
|
KERNEL_F32 \ |
|
|
|
KERNEL_F32 |
|
|
|
|
|
|
|
#define KERNEL_F32 \ |
|
|
|
"ldur q16, ["X"] \n" \ |
|
|
|
"ldur q18, ["X", #16] \n" \ |
|
|
|
@@ -95,7 +104,7 @@ extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n |
|
|
|
"prfm PLDL1KEEP, ["X", #1024] \n" \ |
|
|
|
"prfm PLDL1KEEP, ["X", #1024+64] \n" |
|
|
|
|
|
|
|
#define KERNEL_F32_FINALIZE \ |
|
|
|
#define KERNEL_F_FINALIZE \ |
|
|
|
"fadd v0.2d, v0.2d, v1.2d \n" \ |
|
|
|
"fadd v2.2d, v2.2d, v3.2d \n" \ |
|
|
|
"fadd v4.2d, v4.2d, v5.2d \n" \ |
|
|
|
@@ -111,6 +120,93 @@ extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n |
|
|
|
"fcvt "TMPFD", "TMPF" \n" \ |
|
|
|
"fmadd "SSQD", "TMPFD", "TMPFD", "SSQD"\n" |
|
|
|
|
|
|
|
/* Expands to an empty string in this branch, so no extra instruction is
 * emitted at the point where KERNEL_FINALIZE is pasted into the asm. */
#define KERNEL_FINALIZE \ |
|
|
|
"" |
|
|
|
|
|
|
|
#else |
|
|
|
|
|
|
|
#define N "x0" /* vector length */ |
|
|
|
#define X "x1" /* X vector address */ |
|
|
|
#define INC_X "x2" /* X stride */ |
|
|
|
#define I "x5" /* loop variable */ |
|
|
|
|
|
|
|
/* TMPF: 64-bit scratch register that KERNEL_F1/KERNEL_S1 load from X; those
 * kernels then refer to the same register as v16 when widening.
 * SSQD: scalar register holding the final sum of squares. */
#define TMPF "d16" |
|
|
|
#define SSQD "d0" |
|
|
|
|
|
|
|
/* Loop-shape constants for this branch, pasted as immediates into the asm:
 * N_DIV_SHIFT (N>>4 unrolled iterations), N_REM_MASK (N&15 leftovers) and
 * INC_SHIFT (stride scaled by 8). This matches KERNEL_F below, which
 * advances X by 128 bytes per iteration, i.e. 16 elements of 8 bytes. */
#define N_DIV_SHIFT "4" |
|
|
|
#define N_REM_MASK "15" |
|
|
|
#define INC_SHIFT "3" |
|
|
|
|
|
|
|
/* Single-element step for unit stride: load 8 bytes (two 32-bit values,
 * presumably the real and imaginary parts of one element) into TMPF,
 * advance X by 8, widen both lanes of v16 to double precision, and
 * accumulate their squares lane-wise into v0.2d. */
#define KERNEL_F1 \ |
|
|
|
"ldr "TMPF", ["X"] \n" \ |
|
|
|
"add "X", "X", #8 \n" \ |
|
|
|
"fcvtl v16.2d, v16.2s \n" \ |
|
|
|
"fmla v0.2d, v16.2d, v16.2d \n" |
|
|
|
|
|
|
|
/* Body of one unrolled-loop iteration for unit stride.
 *  1. Load eight 16-byte vectors (128 bytes) from X into the even-numbered
 *     registers q16..q30, then advance X by 128.
 *  2. Widen every vector from four single-precision lanes to two
 *     double-precision vectors. fcvtl2 (upper two lanes -> odd register)
 *     is issued before fcvtl (lower two lanes -> same even register)
 *     because fcvtl overwrites its own source.
 *  3. Accumulate the squares lane-wise into v0..v7; each accumulator is
 *     used twice (v16..v23 first, then v24..v31).
 *  4. Issue two prefetches at offsets 1024 and 1024+64 from the already
 *     advanced X. */
#define KERNEL_F \ |
|
|
|
"ldur q16, ["X"] \n" \ |
|
|
|
"ldur q18, ["X", #16] \n" \ |
|
|
|
"ldur q20, ["X", #32] \n" \ |
|
|
|
"ldur q22, ["X", #48] \n" \ |
|
|
|
"ldur q24, ["X", #64] \n" \ |
|
|
|
"ldur q26, ["X", #80] \n" \ |
|
|
|
"ldur q28, ["X", #96] \n" \ |
|
|
|
"ldur q30, ["X", #112] \n" \ |
|
|
|
"add "X", "X", #128 \n" \ |
|
|
|
"fcvtl2 v17.2d, v16.4s \n" \ |
|
|
|
"fcvtl v16.2d, v16.2s \n" \ |
|
|
|
"fcvtl2 v19.2d, v18.4s \n" \ |
|
|
|
"fcvtl v18.2d, v18.2s \n" \ |
|
|
|
"fcvtl2 v21.2d, v20.4s \n" \ |
|
|
|
"fcvtl v20.2d, v20.2s \n" \ |
|
|
|
"fcvtl2 v23.2d, v22.4s \n" \ |
|
|
|
"fcvtl v22.2d, v22.2s \n" \ |
|
|
|
"fcvtl2 v25.2d, v24.4s \n" \ |
|
|
|
"fcvtl v24.2d, v24.2s \n" \ |
|
|
|
"fcvtl2 v27.2d, v26.4s \n" \ |
|
|
|
"fcvtl v26.2d, v26.2s \n" \ |
|
|
|
"fcvtl2 v29.2d, v28.4s \n" \ |
|
|
|
"fcvtl v28.2d, v28.2s \n" \ |
|
|
|
"fcvtl2 v31.2d, v30.4s \n" \ |
|
|
|
"fcvtl v30.2d, v30.2s \n" \ |
|
|
|
"fmla v0.2d, v16.2d, v16.2d \n" \ |
|
|
|
"fmla v1.2d, v17.2d, v17.2d \n" \ |
|
|
|
"fmla v2.2d, v18.2d, v18.2d \n" \ |
|
|
|
"fmla v3.2d, v19.2d, v19.2d \n" \ |
|
|
|
"fmla v4.2d, v20.2d, v20.2d \n" \ |
|
|
|
"fmla v5.2d, v21.2d, v21.2d \n" \ |
|
|
|
"fmla v6.2d, v22.2d, v22.2d \n" \ |
|
|
|
"fmla v7.2d, v23.2d, v23.2d \n" \ |
|
|
|
"fmla v0.2d, v24.2d, v24.2d \n" \ |
|
|
|
"fmla v1.2d, v25.2d, v25.2d \n" \ |
|
|
|
"fmla v2.2d, v26.2d, v26.2d \n" \ |
|
|
|
"fmla v3.2d, v27.2d, v27.2d \n" \ |
|
|
|
"fmla v4.2d, v28.2d, v28.2d \n" \ |
|
|
|
"fmla v5.2d, v29.2d, v29.2d \n" \ |
|
|
|
"fmla v6.2d, v30.2d, v30.2d \n" \ |
|
|
|
"fmla v7.2d, v31.2d, v31.2d \n" \ |
|
|
|
"prfm PLDL1KEEP, ["X", #1024] \n" \ |
|
|
|
"prfm PLDL1KEEP, ["X", #1024+64] \n" |
|
|
|
|
|
|
|
/* Collapse the eight vector accumulators v0..v7 used by KERNEL_F into
 * v0.2d with a pairwise addition tree: (v0+v1), (v2+v3), (v4+v5), (v6+v7),
 * then (v0+v2), (v4+v6), and finally (v0+v4). The result is still a
 * two-lane vector; the lanes are not summed here. */
#define KERNEL_F_FINALIZE \ |
|
|
|
"fadd v0.2d, v0.2d, v1.2d \n" \ |
|
|
|
"fadd v2.2d, v2.2d, v3.2d \n" \ |
|
|
|
"fadd v4.2d, v4.2d, v5.2d \n" \ |
|
|
|
"fadd v6.2d, v6.2d, v7.2d \n" \ |
|
|
|
"fadd v0.2d, v0.2d, v2.2d \n" \ |
|
|
|
"fadd v4.2d, v4.2d, v6.2d \n" \ |
|
|
|
"fadd v0.2d, v0.2d, v4.2d \n" |
|
|
|
|
|
|
|
/* Final horizontal reduction: add the two double-precision lanes of v0
 * and leave the scalar sum in SSQD. Needed in this branch because
 * KERNEL_F1, KERNEL_F and KERNEL_S1 all accumulate into v0.2d lane-wise. */
#define KERNEL_FINALIZE \ |
|
|
|
"faddp "SSQD", v0.2d \n" |
|
|
|
|
|
|
|
/* Single-element step for the strided path: same arithmetic as KERNEL_F1
 * (load 8 bytes into TMPF, widen both lanes of v16 to double, accumulate
 * the squares into v0.2d), but X is advanced by the INC_X register instead
 * of a constant. NOTE(review): this relies on INC_X already holding a
 * byte stride (the asm scales it with "lsl ... #INC_SHIFT") -- confirm the
 * scaling happens before this macro is reached on every path. */
#define KERNEL_S1 \ |
|
|
|
"ldr "TMPF", ["X"] \n" \ |
|
|
|
"add "X", "X", "INC_X" \n" \ |
|
|
|
"fcvtl v16.2d, v16.2s \n" \ |
|
|
|
"fmla v0.2d, v16.2d, v16.2d \n" |
|
|
|
#endif |
|
|
|
|
|
|
|
|
|
|
|
static double nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x) |
|
|
|
{ |
|
|
|
@@ -138,20 +234,19 @@ static double nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x) |
|
|
|
" bne .Lnrm2_kernel_S_BEGIN \n" |
|
|
|
|
|
|
|
".Lnrm2_kernel_F_BEGIN: \n" |
|
|
|
" asr "I", "N", #6 \n" |
|
|
|
" asr "I", "N", #"N_DIV_SHIFT" \n" |
|
|
|
" cmp "I", xzr \n" |
|
|
|
" beq .Lnrm2_kernel_S_BEGIN \n" |
|
|
|
|
|
|
|
" .align 5 \n" |
|
|
|
".Lnrm2_kernel_F64: \n" |
|
|
|
" "KERNEL_F32" \n" |
|
|
|
" "KERNEL_F32" \n" |
|
|
|
".Lnrm2_kernel_F: \n" |
|
|
|
" "KERNEL_F" \n" |
|
|
|
" subs "I", "I", #1 \n" |
|
|
|
" bne .Lnrm2_kernel_F64 \n" |
|
|
|
" "KERNEL_F32_FINALIZE" \n" |
|
|
|
" bne .Lnrm2_kernel_F \n" |
|
|
|
" "KERNEL_F_FINALIZE" \n" |
|
|
|
|
|
|
|
".Lnrm2_kernel_F1: \n" |
|
|
|
" ands "I", "N", #63 \n" |
|
|
|
" ands "I", "N", #"N_REM_MASK" \n" |
|
|
|
" ble .Lnrm2_kernel_L999 \n" |
|
|
|
|
|
|
|
".Lnrm2_kernel_F10: \n" |
|
|
|
@@ -161,7 +256,7 @@ static double nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x) |
|
|
|
" b .Lnrm2_kernel_L999 \n" |
|
|
|
|
|
|
|
".Lnrm2_kernel_S_BEGIN: \n" |
|
|
|
" lsl "INC_X", "INC_X", #2 \n" |
|
|
|
" lsl "INC_X", "INC_X", #"INC_SHIFT" \n" |
|
|
|
" asr "I", "N", #2 \n" |
|
|
|
" cmp "I", xzr \n" |
|
|
|
" ble .Lnrm2_kernel_S1 \n" |
|
|
|
@@ -184,6 +279,7 @@ static double nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x) |
|
|
|
" bne .Lnrm2_kernel_S10 \n" |
|
|
|
|
|
|
|
".Lnrm2_kernel_L999: \n" |
|
|
|
" "KERNEL_FINALIZE" \n" |
|
|
|
" fmov %[RET_], "SSQD" \n" |
|
|
|
|
|
|
|
: [RET_] "=r" (ret) //%0 |
|
|
|
@@ -214,13 +310,12 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) |
|
|
|
{ |
|
|
|
#if defined(SMP) |
|
|
|
int nthreads; |
|
|
|
FLOAT dummy_alpha; |
|
|
|
FLOAT dummy_alpha[2]; |
|
|
|
#endif |
|
|
|
FLOAT nrm2 = 0.0; |
|
|
|
double nrm2_double = 0.0; |
|
|
|
|
|
|
|
if (n <= 0 || inc_x <= 0) return 0.0; |
|
|
|
if (n == 1) return fabs(x[0]); |
|
|
|
|
|
|
|
#if defined(SMP) |
|
|
|
nthreads = num_cpu_avail(1); |
|
|
|
@@ -235,7 +330,11 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) |
|
|
|
char result[MAX_CPU_NUMBER * sizeof(double) * 2]; |
|
|
|
double *ptr; |
|
|
|
|
|
|
|
#if !defined(COMPLEX) |
|
|
|
mode = BLAS_SINGLE | BLAS_REAL; |
|
|
|
#else |
|
|
|
mode = BLAS_SINGLE | BLAS_COMPLEX; |
|
|
|
#endif |
|
|
|
|
|
|
|
blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, |
|
|
|
x, inc_x, NULL, 0, result, 0, |