| @@ -20,25 +20,36 @@ IDMAXKERNEL = ../arm/imax.c | |||||
| ISMINKERNEL = ../arm/imin.c | ISMINKERNEL = ../arm/imin.c | ||||
| IDMINKERNEL = ../arm/imin.c | IDMINKERNEL = ../arm/imin.c | ||||
| STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
| DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
| CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
| ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||||
| ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||||
| ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||||
| ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
| STRSMKERNEL_LN = trsm_kernel_LN_sve.c | |||||
| STRSMKERNEL_LT = trsm_kernel_LT_sve.c | |||||
| STRSMKERNEL_RN = trsm_kernel_RN_sve.c | |||||
| STRSMKERNEL_RT = trsm_kernel_RT_sve.c | |||||
| DTRSMKERNEL_LN = trsm_kernel_LN_sve.c | |||||
| DTRSMKERNEL_LT = trsm_kernel_LT_sve.c | |||||
| DTRSMKERNEL_RN = trsm_kernel_RN_sve.c | |||||
| DTRSMKERNEL_RT = trsm_kernel_RT_sve.c | |||||
| TRSMCOPYLN_M = trsm_lncopy_sve.c | |||||
| TRSMCOPYLT_M = trsm_ltcopy_sve.c | |||||
| TRSMCOPYUN_M = trsm_uncopy_sve.c | |||||
| TRSMCOPYUT_M = trsm_utcopy_sve.c | |||||
| CTRSMKERNEL_LN = trsm_kernel_LN_sve.c | |||||
| CTRSMKERNEL_LT = trsm_kernel_LT_sve.c | |||||
| CTRSMKERNEL_RN = trsm_kernel_RN_sve.c | |||||
| CTRSMKERNEL_RT = trsm_kernel_RT_sve.c | |||||
| ZTRSMKERNEL_LN = trsm_kernel_LN_sve.c | |||||
| ZTRSMKERNEL_LT = trsm_kernel_LT_sve.c | |||||
| ZTRSMKERNEL_RN = trsm_kernel_RN_sve.c | |||||
| ZTRSMKERNEL_RT = trsm_kernel_RT_sve.c | |||||
| ZTRSMCOPYLN_M = ztrsm_lncopy_sve.c | |||||
| ZTRSMCOPYLT_M = ztrsm_ltcopy_sve.c | |||||
| ZTRSMCOPYUN_M = ztrsm_uncopy_sve.c | |||||
| ZTRSMCOPYUT_M = ztrsm_utcopy_sve.c | |||||
| SAMAXKERNEL = amax.S | SAMAXKERNEL = amax.S | ||||
| DAMAXKERNEL = amax.S | DAMAXKERNEL = amax.S | ||||
| @@ -167,7 +167,11 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, | |||||
| BLASLONG i, j; | BLASLONG i, j; | ||||
| FLOAT *aa, *cc; | FLOAT *aa, *cc; | ||||
| BLASLONG kk; | BLASLONG kk; | ||||
| #ifdef DOUBLE | |||||
| int sve_size = svcntd(); | int sve_size = svcntd(); | ||||
| #else | |||||
| int sve_size = svcntw(); | |||||
| #endif | |||||
| #if 0 | #if 0 | ||||
| fprintf(stderr, "TRSM KERNEL LN : m = %3ld n = %3ld k = %3ld offset = %3ld\n", | fprintf(stderr, "TRSM KERNEL LN : m = %3ld n = %3ld k = %3ld offset = %3ld\n", | ||||
| @@ -157,7 +157,11 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, | |||||
| FLOAT *aa, *cc; | FLOAT *aa, *cc; | ||||
| BLASLONG kk; | BLASLONG kk; | ||||
| BLASLONG i, j, jj; | BLASLONG i, j, jj; | ||||
| #ifdef DOUBLE | |||||
| int sve_size = svcntd(); | int sve_size = svcntd(); | ||||
| #else | |||||
| int sve_size = svcntw(); | |||||
| #endif | |||||
| #if 0 | #if 0 | ||||
| fprintf(stderr, "TRSM KERNEL LT : m = %3ld n = %3ld k = %3ld offset = %3ld\n", | fprintf(stderr, "TRSM KERNEL LT : m = %3ld n = %3ld k = %3ld offset = %3ld\n", | ||||
| @@ -157,7 +157,11 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, | |||||
| FLOAT *aa, *cc; | FLOAT *aa, *cc; | ||||
| BLASLONG kk; | BLASLONG kk; | ||||
| BLASLONG i, j, jj; | BLASLONG i, j, jj; | ||||
| #ifdef DOUBLE | |||||
| int sve_size = svcntd(); | int sve_size = svcntd(); | ||||
| #else | |||||
| int sve_size = svcntw(); | |||||
| #endif | |||||
| #if 0 | #if 0 | ||||
| fprintf(stderr, "TRSM RN KERNEL m = %3ld n = %3ld k = %3ld offset = %3ld\n", | fprintf(stderr, "TRSM RN KERNEL m = %3ld n = %3ld k = %3ld offset = %3ld\n", | ||||
| @@ -169,7 +169,11 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, | |||||
| BLASLONG i, j; | BLASLONG i, j; | ||||
| FLOAT *aa, *cc; | FLOAT *aa, *cc; | ||||
| BLASLONG kk; | BLASLONG kk; | ||||
| #ifdef DOUBLE | |||||
| int sve_size = svcntd(); | int sve_size = svcntd(); | ||||
| #else | |||||
| int sve_size = svcntw(); | |||||
| #endif | |||||
| #if 0 | #if 0 | ||||
| fprintf(stderr, "TRSM RT KERNEL m = %3ld n = %3ld k = %3ld offset = %3ld\n", | fprintf(stderr, "TRSM RT KERNEL m = %3ld n = %3ld k = %3ld offset = %3ld\n", | ||||
| @@ -59,9 +59,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT | |||||
| svbool_t pn = svwhilelt_b64(js, n); | svbool_t pn = svwhilelt_b64(js, n); | ||||
| int n_active = svcntp_b64(svptrue_b64(), pn); | int n_active = svcntp_b64(svptrue_b64(), pn); | ||||
| #else | #else | ||||
| int32_t N = n; | |||||
| int32_t js = 0; | int32_t js = 0; | ||||
| svint32_t index = svindex_s32(0, lda); | svint32_t index = svindex_s32(0, lda); | ||||
| svbool_t pn = svwhilelt_b32(js, n); | |||||
| svbool_t pn = svwhilelt_b32(js, N); | |||||
| int n_active = svcntp_b32(svptrue_b32(), pn); | int n_active = svcntp_b32(svptrue_b32(), pn); | ||||
| #endif | #endif | ||||
| do { | do { | ||||
| @@ -85,7 +86,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT | |||||
| ii += n_active; | ii += n_active; | ||||
| } else { | } else { | ||||
| if (ii > jj) { | if (ii > jj) { | ||||
| #ifdef DOUBLE | |||||
| svfloat64_t aj_vec = svld1_gather_index(pn, ao, index); | svfloat64_t aj_vec = svld1_gather_index(pn, ao, index); | ||||
| #else | |||||
| svfloat32_t aj_vec = svld1_gather_index(pn, ao, index); | |||||
| #endif | |||||
| svst1(pn, b, aj_vec); | svst1(pn, b, aj_vec); | ||||
| } | } | ||||
| ao++; | ao++; | ||||
| @@ -105,7 +110,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT | |||||
| n_active = svcntp_b64(svptrue_b64(), pn); | n_active = svcntp_b64(svptrue_b64(), pn); | ||||
| } while (svptest_any(svptrue_b64(), pn)); | } while (svptest_any(svptrue_b64(), pn)); | ||||
| #else | #else | ||||
| pn = svwhilelt_b32(js, n); | |||||
| pn = svwhilelt_b32(js, N); | |||||
| n_active = svcntp_b32(svptrue_b32(), pn); | n_active = svcntp_b32(svptrue_b32(), pn); | ||||
| } while (svptest_any(svptrue_b32(), pn)); | } while (svptest_any(svptrue_b32(), pn)); | ||||
| #endif | #endif | ||||
| @@ -58,8 +58,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT | |||||
| svbool_t pn = svwhilelt_b64(js, n); | svbool_t pn = svwhilelt_b64(js, n); | ||||
| int n_active = svcntp_b64(svptrue_b64(), pn); | int n_active = svcntp_b64(svptrue_b64(), pn); | ||||
| #else | #else | ||||
| int32_t N = n; | |||||
| int32_t js = 0; | int32_t js = 0; | ||||
| svbool_t pn = svwhilelt_b32(js, n); | |||||
| svbool_t pn = svwhilelt_b32(js, N); | |||||
| int n_active = svcntp_b32(svptrue_b32(), pn); | int n_active = svcntp_b32(svptrue_b32(), pn); | ||||
| #endif | #endif | ||||
| do { | do { | ||||
| @@ -83,7 +84,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT | |||||
| ii += n_active; | ii += n_active; | ||||
| } else { | } else { | ||||
| if (ii < jj) { | if (ii < jj) { | ||||
| #ifdef DOUBLE | |||||
| svfloat64_t aj_vec = svld1(pn, ao); | svfloat64_t aj_vec = svld1(pn, ao); | ||||
| #else | |||||
| svfloat32_t aj_vec = svld1(pn, ao); | |||||
| #endif | |||||
| svst1(pn, b, aj_vec); | svst1(pn, b, aj_vec); | ||||
| } | } | ||||
| ao += lda; | ao += lda; | ||||
| @@ -103,7 +108,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT | |||||
| n_active = svcntp_b64(svptrue_b64(), pn); | n_active = svcntp_b64(svptrue_b64(), pn); | ||||
| } while (svptest_any(svptrue_b64(), pn)); | } while (svptest_any(svptrue_b64(), pn)); | ||||
| #else | #else | ||||
| pn = svwhilelt_b32(js, n); | |||||
| pn = svwhilelt_b32(js, N); | |||||
| n_active = svcntp_b32(svptrue_b32(), pn); | n_active = svcntp_b32(svptrue_b32(), pn); | ||||
| } while (svptest_any(svptrue_b32(), pn)); | } while (svptest_any(svptrue_b32(), pn)); | ||||
| #endif | #endif | ||||
| @@ -59,9 +59,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT | |||||
| svbool_t pn = svwhilelt_b64(js, n); | svbool_t pn = svwhilelt_b64(js, n); | ||||
| int n_active = svcntp_b64(svptrue_b64(), pn); | int n_active = svcntp_b64(svptrue_b64(), pn); | ||||
| #else | #else | ||||
| int32_t N = n; | |||||
| int32_t js = 0; | int32_t js = 0; | ||||
| svint32_t index = svindex_s32(0, lda); | svint32_t index = svindex_s32(0, lda); | ||||
| svbool_t pn = svwhilelt_b32(js, n); | |||||
| svbool_t pn = svwhilelt_b32(js, N); | |||||
| int n_active = svcntp_b32(svptrue_b32(), pn); | int n_active = svcntp_b32(svptrue_b32(), pn); | ||||
| #endif | #endif | ||||
| do { | do { | ||||
| @@ -85,7 +86,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT | |||||
| ii += n_active; | ii += n_active; | ||||
| } else { | } else { | ||||
| if (ii < jj) { | if (ii < jj) { | ||||
| #ifdef DOUBLE | |||||
| svfloat64_t aj_vec = svld1_gather_index(pn, ao, index); | svfloat64_t aj_vec = svld1_gather_index(pn, ao, index); | ||||
| #else | |||||
| svfloat32_t aj_vec = svld1_gather_index(pn, ao, index); | |||||
| #endif | |||||
| svst1(pn, b, aj_vec); | svst1(pn, b, aj_vec); | ||||
| } | } | ||||
| ao++; | ao++; | ||||
| @@ -105,7 +110,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT | |||||
| n_active = svcntp_b64(svptrue_b64(), pn); | n_active = svcntp_b64(svptrue_b64(), pn); | ||||
| } while (svptest_any(svptrue_b64(), pn)); | } while (svptest_any(svptrue_b64(), pn)); | ||||
| #else | #else | ||||
| pn = svwhilelt_b32(js, n); | |||||
| pn = svwhilelt_b32(js, N); | |||||
| n_active = svcntp_b32(svptrue_b32(), pn); | n_active = svcntp_b32(svptrue_b32(), pn); | ||||
| } while (svptest_any(svptrue_b32(), pn)); | } while (svptest_any(svptrue_b32(), pn)); | ||||
| #endif | #endif | ||||
| @@ -58,8 +58,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT | |||||
| svbool_t pn = svwhilelt_b64(js, n); | svbool_t pn = svwhilelt_b64(js, n); | ||||
| int n_active = svcntp_b64(svptrue_b64(), pn); | int n_active = svcntp_b64(svptrue_b64(), pn); | ||||
| #else | #else | ||||
| int32_t N = n; | |||||
| int32_t js = 0; | int32_t js = 0; | ||||
| svbool_t pn = svwhilelt_b32(js, n); | |||||
| svbool_t pn = svwhilelt_b32(js, N); | |||||
| int n_active = svcntp_b32(svptrue_b32(), pn); | int n_active = svcntp_b32(svptrue_b32(), pn); | ||||
| #endif | #endif | ||||
| do { | do { | ||||
| @@ -83,7 +84,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT | |||||
| ii += n_active; | ii += n_active; | ||||
| } else { | } else { | ||||
| if (ii > jj) { | if (ii > jj) { | ||||
| #ifdef DOUBLE | |||||
| svfloat64_t aj_vec = svld1(pn, ao); | svfloat64_t aj_vec = svld1(pn, ao); | ||||
| #else | |||||
| svfloat32_t aj_vec = svld1(pn, ao); | |||||
| #endif | |||||
| svst1(pn, b, aj_vec); | svst1(pn, b, aj_vec); | ||||
| } | } | ||||
| ao += lda; | ao += lda; | ||||
| @@ -103,7 +108,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT | |||||
| n_active = svcntp_b64(svptrue_b64(), pn); | n_active = svcntp_b64(svptrue_b64(), pn); | ||||
| } while (svptest_any(svptrue_b64(), pn)); | } while (svptest_any(svptrue_b64(), pn)); | ||||
| #else | #else | ||||
| pn = svwhilelt_b32(js, n); | |||||
| pn = svwhilelt_b32(js, N); | |||||
| n_active = svcntp_b32(svptrue_b32(), pn); | n_active = svcntp_b32(svptrue_b32(), pn); | ||||
| } while (svptest_any(svptrue_b32(), pn)); | } while (svptest_any(svptrue_b32(), pn)); | ||||
| #endif | #endif | ||||
| @@ -0,0 +1,119 @@ | |||||
| /*********************************************************************/ | |||||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||||
| /* All rights reserved. */ | |||||
| /* */ | |||||
| /* Redistribution and use in source and binary forms, with or */ | |||||
| /* without modification, are permitted provided that the following */ | |||||
| /* conditions are met: */ | |||||
| /* */ | |||||
| /* 1. Redistributions of source code must retain the above */ | |||||
| /* copyright notice, this list of conditions and the following */ | |||||
| /* disclaimer. */ | |||||
| /* */ | |||||
| /* 2. Redistributions in binary form must reproduce the above */ | |||||
| /* copyright notice, this list of conditions and the following */ | |||||
| /* disclaimer in the documentation and/or other materials */ | |||||
| /* provided with the distribution. */ | |||||
| /* */ | |||||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||||
| /* */ | |||||
| /* The views and conclusions contained in the software and */ | |||||
| /* documentation are those of the authors and should not be */ | |||||
| /* interpreted as representing official policies, either expressed */ | |||||
| /* or implied, of The University of Texas at Austin. */ | |||||
| /*********************************************************************/ | |||||
| #include <stdio.h> | |||||
| #include "common.h" | |||||
| #include "arm_sve.h" | |||||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ | |||||
| BLASLONG i, ii, jj; | |||||
| FLOAT *ao; | |||||
| lda *= 2; | |||||
| jj = offset; | |||||
| #ifdef DOUBLE | |||||
| int64_t js = 0; | |||||
| svint64_t index = svindex_s64(0LL, lda); | |||||
| svbool_t pn = svwhilelt_b64(js, n); | |||||
| int n_active = svcntp_b64(svptrue_b64(), pn); | |||||
| #else | |||||
| int32_t N = n; | |||||
| int32_t js = 0; | |||||
| svint32_t index = svindex_s32(0, lda); | |||||
| svbool_t pn = svwhilelt_b32(js, N); | |||||
| int n_active = svcntp_b32(svptrue_b32(), pn); | |||||
| #endif | |||||
| do { | |||||
| ao = a; | |||||
| i = 0; | |||||
| ii = 0; | |||||
| do { | |||||
| if (ii == jj) { | |||||
| for (int j = 0; j < n_active; j++) { | |||||
| for (int k = 0; k < j; k++) { | |||||
| *(b + 2*j * n_active + 2*k) = *(ao + k * lda + 2*j); | |||||
| *(b + 2*j * n_active + 2*k + 1) = *(ao + k * lda + 2*j + 1); | |||||
| } | |||||
| compinv(b + 2*j * n_active + 2*j, *(ao + j * lda + 2*j), *(ao + j * lda + 2*j+1)); | |||||
| //*(b + j * n_active + j) = INV(*(ao + j * lda + j)); | |||||
| } | |||||
| ao += n_active * 2; | |||||
| b += n_active * n_active * 2; | |||||
| i += n_active; | |||||
| ii += n_active; | |||||
| } else { | |||||
| if (ii > jj) { | |||||
| #ifdef DOUBLE | |||||
| svfloat64_t aj_vec_real = svld1_gather_index(pn, ao, index); | |||||
| svfloat64_t aj_vec_imag = svld1_gather_index(pn, ao+1, index); | |||||
| #else | |||||
| svfloat32_t aj_vec_real = svld1_gather_index(pn, ao, index); | |||||
| svfloat32_t aj_vec_imag = svld1_gather_index(pn, ao+1, index); | |||||
| #endif | |||||
| svst2(pn, b, svcreate2(aj_vec_real, aj_vec_imag)); | |||||
| } | |||||
| ao += 2; | |||||
| b += n_active * 2; | |||||
| i++; | |||||
| ii++; | |||||
| } | |||||
| } while (i < m); | |||||
| a += n_active * lda; | |||||
| jj += n_active; | |||||
| js += n_active; | |||||
| #ifdef DOUBLE | |||||
| pn = svwhilelt_b64(js, n); | |||||
| n_active = svcntp_b64(svptrue_b64(), pn); | |||||
| } while (svptest_any(svptrue_b64(), pn)); | |||||
| #else | |||||
| pn = svwhilelt_b32(js, N); | |||||
| n_active = svcntp_b32(svptrue_b32(), pn); | |||||
| } while (svptest_any(svptrue_b32(), pn)); | |||||
| #endif | |||||
| return 0; | |||||
| } | |||||
| @@ -0,0 +1,115 @@ | |||||
| /*********************************************************************/ | |||||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||||
| /* All rights reserved. */ | |||||
| /* */ | |||||
| /* Redistribution and use in source and binary forms, with or */ | |||||
| /* without modification, are permitted provided that the following */ | |||||
| /* conditions are met: */ | |||||
| /* */ | |||||
| /* 1. Redistributions of source code must retain the above */ | |||||
| /* copyright notice, this list of conditions and the following */ | |||||
| /* disclaimer. */ | |||||
| /* */ | |||||
| /* 2. Redistributions in binary form must reproduce the above */ | |||||
| /* copyright notice, this list of conditions and the following */ | |||||
| /* disclaimer in the documentation and/or other materials */ | |||||
| /* provided with the distribution. */ | |||||
| /* */ | |||||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||||
| /* */ | |||||
| /* The views and conclusions contained in the software and */ | |||||
| /* documentation are those of the authors and should not be */ | |||||
| /* interpreted as representing official policies, either expressed */ | |||||
| /* or implied, of The University of Texas at Austin. */ | |||||
| /*********************************************************************/ | |||||
| #include <stdio.h> | |||||
| #include "common.h" | |||||
| #include "arm_sve.h" | |||||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ | |||||
| BLASLONG i, ii, jj; | |||||
| FLOAT *ao; | |||||
| lda *= 2; | |||||
| jj = offset; | |||||
| #ifdef DOUBLE | |||||
| int64_t js = 0; | |||||
| svbool_t pn = svwhilelt_b64(js, n); | |||||
| int n_active = svcntp_b64(svptrue_b64(), pn); | |||||
| #else | |||||
| int32_t N = n; | |||||
| int32_t js = 0; | |||||
| svbool_t pn = svwhilelt_b32(js, N); | |||||
| int n_active = svcntp_b32(svptrue_b32(), pn); | |||||
| #endif | |||||
| do { | |||||
| ao = a; | |||||
| i = 0; | |||||
| ii = 0; | |||||
| do { | |||||
| if (ii == jj) { | |||||
| for (int j = 0; j < n_active; j++) { | |||||
| compinv(b + 2*j * n_active + 2*j, *(ao + j * lda + 2*j), *(ao + j * lda + 2*j+1)); | |||||
| //*(b + j * n_active + j) = INV(*(ao + j * lda + j)); | |||||
| for (int k = j+1; k < n_active; k++) { | |||||
| *(b + 2*j * n_active + 2*k) = *(ao + j * lda + 2*k); | |||||
| *(b + 2*j * n_active + 2*k + 1) = *(ao + j * lda + 2*k + 1); | |||||
| } | |||||
| } | |||||
| b += n_active * n_active * 2; | |||||
| ao += lda * n_active * 2; | |||||
| i += n_active; | |||||
| ii += n_active; | |||||
| } else { | |||||
| if (ii < jj) { | |||||
| #ifdef DOUBLE | |||||
| svfloat64x2_t aj_vec = svld2(pn, ao); | |||||
| #else | |||||
| svfloat32x2_t aj_vec = svld2(pn, ao); | |||||
| #endif | |||||
| svst2(pn, b, aj_vec); | |||||
| } | |||||
| ao += lda; | |||||
| b += n_active * 2; | |||||
| i ++; | |||||
| ii ++; | |||||
| } | |||||
| } while (i < m); | |||||
| a += n_active * 2; | |||||
| jj += n_active; | |||||
| js += n_active; | |||||
| #ifdef DOUBLE | |||||
| pn = svwhilelt_b64(js, n); | |||||
| n_active = svcntp_b64(svptrue_b64(), pn); | |||||
| } while (svptest_any(svptrue_b64(), pn)); | |||||
| #else | |||||
| pn = svwhilelt_b32(js, N); | |||||
| n_active = svcntp_b32(svptrue_b32(), pn); | |||||
| } while (svptest_any(svptrue_b32(), pn)); | |||||
| #endif | |||||
| return 0; | |||||
| } | |||||
| @@ -0,0 +1,119 @@ | |||||
| /*********************************************************************/ | |||||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||||
| /* All rights reserved. */ | |||||
| /* */ | |||||
| /* Redistribution and use in source and binary forms, with or */ | |||||
| /* without modification, are permitted provided that the following */ | |||||
| /* conditions are met: */ | |||||
| /* */ | |||||
| /* 1. Redistributions of source code must retain the above */ | |||||
| /* copyright notice, this list of conditions and the following */ | |||||
| /* disclaimer. */ | |||||
| /* */ | |||||
| /* 2. Redistributions in binary form must reproduce the above */ | |||||
| /* copyright notice, this list of conditions and the following */ | |||||
| /* disclaimer in the documentation and/or other materials */ | |||||
| /* provided with the distribution. */ | |||||
| /* */ | |||||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||||
| /* */ | |||||
| /* The views and conclusions contained in the software and */ | |||||
| /* documentation are those of the authors and should not be */ | |||||
| /* interpreted as representing official policies, either expressed */ | |||||
| /* or implied, of The University of Texas at Austin. */ | |||||
| /*********************************************************************/ | |||||
| #include <stdio.h> | |||||
| #include "common.h" | |||||
| #include "arm_sve.h" | |||||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ | |||||
| BLASLONG i, ii, jj; | |||||
| FLOAT *ao; | |||||
| lda *= 2; | |||||
| jj = offset; | |||||
| #ifdef DOUBLE | |||||
| int64_t js = 0; | |||||
| svint64_t index = svindex_s64(0LL, lda); | |||||
| svbool_t pn = svwhilelt_b64(js, n); | |||||
| int n_active = svcntp_b64(svptrue_b64(), pn); | |||||
| #else | |||||
| int32_t N = n; | |||||
| int32_t js = 0; | |||||
| svint32_t index = svindex_s32(0, lda); | |||||
| svbool_t pn = svwhilelt_b32(js, N); | |||||
| int n_active = svcntp_b32(svptrue_b32(), pn); | |||||
| #endif | |||||
| do { | |||||
| ao = a; | |||||
| i = 0; | |||||
| ii = 0; | |||||
| do { | |||||
| if (ii == jj) { | |||||
| for (int j = 0; j < n_active; j++) { | |||||
| compinv(b + 2*j * n_active + 2*j, *(ao + j * lda + 2*j), *(ao + j * lda + 2*j+1)); | |||||
| //*(b + j * n_active + j) = INV(*(ao + j * lda + j)); | |||||
| for (int k = j+1; k < n_active; k++) { | |||||
| *(b + 2*j * n_active + 2*k) = *(ao + k * lda + 2*j); | |||||
| *(b + 2*j * n_active + 2*k + 1) = *(ao + k * lda + 2*j + 1); | |||||
| } | |||||
| } | |||||
| ao += n_active * 2; | |||||
| b += n_active * n_active * 2; | |||||
| i += n_active; | |||||
| ii += n_active; | |||||
| } else { | |||||
| if (ii < jj) { | |||||
| #ifdef DOUBLE | |||||
| svfloat64_t aj_vec_real = svld1_gather_index(pn, ao, index); | |||||
| svfloat64_t aj_vec_imag = svld1_gather_index(pn, ao+1, index); | |||||
| #else | |||||
| svfloat32_t aj_vec_real = svld1_gather_index(pn, ao, index); | |||||
| svfloat32_t aj_vec_imag = svld1_gather_index(pn, ao+1, index); | |||||
| #endif | |||||
| svst2(pn, b, svcreate2(aj_vec_real, aj_vec_imag)); | |||||
| } | |||||
| ao += 2; | |||||
| b += n_active * 2; | |||||
| i++; | |||||
| ii++; | |||||
| } | |||||
| } while (i < m); | |||||
| a += n_active * lda; | |||||
| jj += n_active; | |||||
| js += n_active; | |||||
| #ifdef DOUBLE | |||||
| pn = svwhilelt_b64(js, n); | |||||
| n_active = svcntp_b64(svptrue_b64(), pn); | |||||
| } while (svptest_any(svptrue_b64(), pn)); | |||||
| #else | |||||
| pn = svwhilelt_b32(js, N); | |||||
| n_active = svcntp_b32(svptrue_b32(), pn); | |||||
| } while (svptest_any(svptrue_b32(), pn)); | |||||
| #endif | |||||
| return 0; | |||||
| } | |||||
| @@ -0,0 +1,115 @@ | |||||
| /*********************************************************************/ | |||||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||||
| /* All rights reserved. */ | |||||
| /* */ | |||||
| /* Redistribution and use in source and binary forms, with or */ | |||||
| /* without modification, are permitted provided that the following */ | |||||
| /* conditions are met: */ | |||||
| /* */ | |||||
| /* 1. Redistributions of source code must retain the above */ | |||||
| /* copyright notice, this list of conditions and the following */ | |||||
| /* disclaimer. */ | |||||
| /* */ | |||||
| /* 2. Redistributions in binary form must reproduce the above */ | |||||
| /* copyright notice, this list of conditions and the following */ | |||||
| /* disclaimer in the documentation and/or other materials */ | |||||
| /* provided with the distribution. */ | |||||
| /* */ | |||||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||||
| /* */ | |||||
| /* The views and conclusions contained in the software and */ | |||||
| /* documentation are those of the authors and should not be */ | |||||
| /* interpreted as representing official policies, either expressed */ | |||||
| /* or implied, of The University of Texas at Austin. */ | |||||
| /*********************************************************************/ | |||||
| #include <stdio.h> | |||||
| #include "common.h" | |||||
| #include "arm_sve.h" | |||||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ | |||||
| BLASLONG i, ii, jj; | |||||
| FLOAT *ao; | |||||
| lda *= 2; | |||||
| jj = offset; | |||||
| #ifdef DOUBLE | |||||
| int64_t js = 0; | |||||
| svbool_t pn = svwhilelt_b64(js, n); | |||||
| int n_active = svcntp_b64(svptrue_b64(), pn); | |||||
| #else | |||||
| int32_t N = n; | |||||
| int32_t js = 0; | |||||
| svbool_t pn = svwhilelt_b32(js, N); | |||||
| int n_active = svcntp_b32(svptrue_b32(), pn); | |||||
| #endif | |||||
| do { | |||||
| ao = a; | |||||
| i = 0; | |||||
| ii = 0; | |||||
| do { | |||||
| if (ii == jj) { | |||||
| for (int j = 0; j < n_active; j++) { | |||||
| for (int k = 0; k < j; k++) { | |||||
| *(b + 2*j * n_active + 2*k) = *(ao + j * lda + 2*k); | |||||
| *(b + 2*j * n_active + 2*k + 1) = *(ao + j * lda + 2*k + 1); | |||||
| } | |||||
| compinv(b + 2*j * n_active + 2*j, *(ao + j * lda + 2*j), *(ao + j * lda + 2*j+1)); | |||||
| //*(b + j * n_active + j) = INV(*(ao + j * lda + j)); | |||||
| } | |||||
| ao += lda * n_active * 2; | |||||
| b += n_active * n_active * 2; | |||||
| i += n_active; | |||||
| ii += n_active; | |||||
| } else { | |||||
| if (ii > jj) { | |||||
| #ifdef DOUBLE | |||||
| svfloat64x2_t aj_vec = svld2(pn, ao); | |||||
| #else | |||||
| svfloat32x2_t aj_vec = svld2(pn, ao); | |||||
| #endif | |||||
| svst2(pn, b, aj_vec); | |||||
| } | |||||
| ao += lda; | |||||
| b += n_active * 2; | |||||
| i ++; | |||||
| ii ++; | |||||
| } | |||||
| } while (i < m); | |||||
| a += n_active * 2; | |||||
| jj += n_active; | |||||
| js += n_active; | |||||
| #ifdef DOUBLE | |||||
| pn = svwhilelt_b64(js, n); | |||||
| n_active = svcntp_b64(svptrue_b64(), pn); | |||||
| } while (svptest_any(svptrue_b64(), pn)); | |||||
| #else | |||||
| pn = svwhilelt_b32(js, N); | |||||
| n_active = svcntp_b32(svptrue_b32(), pn); | |||||
| } while (svptest_any(svptrue_b32(), pn)); | |||||
| #endif | |||||
| return 0; | |||||
| } | |||||