| @@ -20,25 +20,36 @@ IDMAXKERNEL = ../arm/imax.c | |||
| ISMINKERNEL = ../arm/imin.c | |||
| IDMINKERNEL = ../arm/imin.c | |||
| STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| STRSMKERNEL_LN = trsm_kernel_LN_sve.c | |||
| STRSMKERNEL_LT = trsm_kernel_LT_sve.c | |||
| STRSMKERNEL_RN = trsm_kernel_RN_sve.c | |||
| STRSMKERNEL_RT = trsm_kernel_RT_sve.c | |||
| DTRSMKERNEL_LN = trsm_kernel_LN_sve.c | |||
| DTRSMKERNEL_LT = trsm_kernel_LT_sve.c | |||
| DTRSMKERNEL_RN = trsm_kernel_RN_sve.c | |||
| DTRSMKERNEL_RT = trsm_kernel_RT_sve.c | |||
| TRSMCOPYLN_M = trsm_lncopy_sve.c | |||
| TRSMCOPYLT_M = trsm_ltcopy_sve.c | |||
| TRSMCOPYUN_M = trsm_uncopy_sve.c | |||
| TRSMCOPYUT_M = trsm_utcopy_sve.c | |||
| CTRSMKERNEL_LN = trsm_kernel_LN_sve.c | |||
| CTRSMKERNEL_LT = trsm_kernel_LT_sve.c | |||
| CTRSMKERNEL_RN = trsm_kernel_RN_sve.c | |||
| CTRSMKERNEL_RT = trsm_kernel_RT_sve.c | |||
| ZTRSMKERNEL_LN = trsm_kernel_LN_sve.c | |||
| ZTRSMKERNEL_LT = trsm_kernel_LT_sve.c | |||
| ZTRSMKERNEL_RN = trsm_kernel_RN_sve.c | |||
| ZTRSMKERNEL_RT = trsm_kernel_RT_sve.c | |||
| ZTRSMCOPYLN_M = ztrsm_lncopy_sve.c | |||
| ZTRSMCOPYLT_M = ztrsm_ltcopy_sve.c | |||
| ZTRSMCOPYUN_M = ztrsm_uncopy_sve.c | |||
| ZTRSMCOPYUT_M = ztrsm_utcopy_sve.c | |||
| SAMAXKERNEL = amax.S | |||
| DAMAXKERNEL = amax.S | |||
| @@ -167,7 +167,11 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, | |||
| BLASLONG i, j; | |||
| FLOAT *aa, *cc; | |||
| BLASLONG kk; | |||
| #ifdef DOUBLE | |||
| int sve_size = svcntd(); | |||
| #else | |||
| int sve_size = svcntw(); | |||
| #endif | |||
| #if 0 | |||
| fprintf(stderr, "TRSM KERNEL LN : m = %3ld n = %3ld k = %3ld offset = %3ld\n", | |||
| @@ -157,7 +157,11 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, | |||
| FLOAT *aa, *cc; | |||
| BLASLONG kk; | |||
| BLASLONG i, j, jj; | |||
| #ifdef DOUBLE | |||
| int sve_size = svcntd(); | |||
| #else | |||
| int sve_size = svcntw(); | |||
| #endif | |||
| #if 0 | |||
| fprintf(stderr, "TRSM KERNEL LT : m = %3ld n = %3ld k = %3ld offset = %3ld\n", | |||
| @@ -157,7 +157,11 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, | |||
| FLOAT *aa, *cc; | |||
| BLASLONG kk; | |||
| BLASLONG i, j, jj; | |||
| #ifdef DOUBLE | |||
| int sve_size = svcntd(); | |||
| #else | |||
| int sve_size = svcntw(); | |||
| #endif | |||
| #if 0 | |||
| fprintf(stderr, "TRSM RN KERNEL m = %3ld n = %3ld k = %3ld offset = %3ld\n", | |||
| @@ -169,7 +169,11 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, | |||
| BLASLONG i, j; | |||
| FLOAT *aa, *cc; | |||
| BLASLONG kk; | |||
| #ifdef DOUBLE | |||
| int sve_size = svcntd(); | |||
| #else | |||
| int sve_size = svcntw(); | |||
| #endif | |||
| #if 0 | |||
| fprintf(stderr, "TRSM RT KERNEL m = %3ld n = %3ld k = %3ld offset = %3ld\n", | |||
| @@ -59,9 +59,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT | |||
| svbool_t pn = svwhilelt_b64(js, n); | |||
| int n_active = svcntp_b64(svptrue_b64(), pn); | |||
| #else | |||
| int32_t N = n; | |||
| int32_t js = 0; | |||
| svint32_t index = svindex_s32(0, lda); | |||
| svbool_t pn = svwhilelt_b32(js, n); | |||
| svbool_t pn = svwhilelt_b32(js, N); | |||
| int n_active = svcntp_b32(svptrue_b32(), pn); | |||
| #endif | |||
| do { | |||
| @@ -85,7 +86,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT | |||
| ii += n_active; | |||
| } else { | |||
| if (ii > jj) { | |||
| #ifdef DOUBLE | |||
| svfloat64_t aj_vec = svld1_gather_index(pn, ao, index); | |||
| #else | |||
| svfloat32_t aj_vec = svld1_gather_index(pn, ao, index); | |||
| #endif | |||
| svst1(pn, b, aj_vec); | |||
| } | |||
| ao++; | |||
| @@ -105,7 +110,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT | |||
| n_active = svcntp_b64(svptrue_b64(), pn); | |||
| } while (svptest_any(svptrue_b64(), pn)); | |||
| #else | |||
| pn = svwhilelt_b32(js, n); | |||
| pn = svwhilelt_b32(js, N); | |||
| n_active = svcntp_b32(svptrue_b32(), pn); | |||
| } while (svptest_any(svptrue_b32(), pn)); | |||
| #endif | |||
| @@ -58,8 +58,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT | |||
| svbool_t pn = svwhilelt_b64(js, n); | |||
| int n_active = svcntp_b64(svptrue_b64(), pn); | |||
| #else | |||
| int32_t N = n; | |||
| int32_t js = 0; | |||
| svbool_t pn = svwhilelt_b32(js, n); | |||
| svbool_t pn = svwhilelt_b32(js, N); | |||
| int n_active = svcntp_b32(svptrue_b32(), pn); | |||
| #endif | |||
| do { | |||
| @@ -83,7 +84,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT | |||
| ii += n_active; | |||
| } else { | |||
| if (ii < jj) { | |||
| #ifdef DOUBLE | |||
| svfloat64_t aj_vec = svld1(pn, ao); | |||
| #else | |||
| svfloat32_t aj_vec = svld1(pn, ao); | |||
| #endif | |||
| svst1(pn, b, aj_vec); | |||
| } | |||
| ao += lda; | |||
| @@ -103,7 +108,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT | |||
| n_active = svcntp_b64(svptrue_b64(), pn); | |||
| } while (svptest_any(svptrue_b64(), pn)); | |||
| #else | |||
| pn = svwhilelt_b32(js, n); | |||
| pn = svwhilelt_b32(js, N); | |||
| n_active = svcntp_b32(svptrue_b32(), pn); | |||
| } while (svptest_any(svptrue_b32(), pn)); | |||
| #endif | |||
| @@ -59,9 +59,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT | |||
| svbool_t pn = svwhilelt_b64(js, n); | |||
| int n_active = svcntp_b64(svptrue_b64(), pn); | |||
| #else | |||
| int32_t N = n; | |||
| int32_t js = 0; | |||
| svint32_t index = svindex_s32(0, lda); | |||
| svbool_t pn = svwhilelt_b32(js, n); | |||
| svbool_t pn = svwhilelt_b32(js, N); | |||
| int n_active = svcntp_b32(svptrue_b32(), pn); | |||
| #endif | |||
| do { | |||
| @@ -85,7 +86,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT | |||
| ii += n_active; | |||
| } else { | |||
| if (ii < jj) { | |||
| #ifdef DOUBLE | |||
| svfloat64_t aj_vec = svld1_gather_index(pn, ao, index); | |||
| #else | |||
| svfloat32_t aj_vec = svld1_gather_index(pn, ao, index); | |||
| #endif | |||
| svst1(pn, b, aj_vec); | |||
| } | |||
| ao++; | |||
| @@ -105,7 +110,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT | |||
| n_active = svcntp_b64(svptrue_b64(), pn); | |||
| } while (svptest_any(svptrue_b64(), pn)); | |||
| #else | |||
| pn = svwhilelt_b32(js, n); | |||
| pn = svwhilelt_b32(js, N); | |||
| n_active = svcntp_b32(svptrue_b32(), pn); | |||
| } while (svptest_any(svptrue_b32(), pn)); | |||
| #endif | |||
| @@ -58,8 +58,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT | |||
| svbool_t pn = svwhilelt_b64(js, n); | |||
| int n_active = svcntp_b64(svptrue_b64(), pn); | |||
| #else | |||
| int32_t N = n; | |||
| int32_t js = 0; | |||
| svbool_t pn = svwhilelt_b32(js, n); | |||
| svbool_t pn = svwhilelt_b32(js, N); | |||
| int n_active = svcntp_b32(svptrue_b32(), pn); | |||
| #endif | |||
| do { | |||
| @@ -83,7 +84,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT | |||
| ii += n_active; | |||
| } else { | |||
| if (ii > jj) { | |||
| #ifdef DOUBLE | |||
| svfloat64_t aj_vec = svld1(pn, ao); | |||
| #else | |||
| svfloat32_t aj_vec = svld1(pn, ao); | |||
| #endif | |||
| svst1(pn, b, aj_vec); | |||
| } | |||
| ao += lda; | |||
| @@ -103,7 +108,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT | |||
| n_active = svcntp_b64(svptrue_b64(), pn); | |||
| } while (svptest_any(svptrue_b64(), pn)); | |||
| #else | |||
| pn = svwhilelt_b32(js, n); | |||
| pn = svwhilelt_b32(js, N); | |||
| n_active = svcntp_b32(svptrue_b32(), pn); | |||
| } while (svptest_any(svptrue_b32(), pn)); | |||
| #endif | |||
| @@ -0,0 +1,119 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| #include "arm_sve.h" | |||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ | |||
| BLASLONG i, ii, jj; | |||
| FLOAT *ao; | |||
| lda *= 2; | |||
| jj = offset; | |||
| #ifdef DOUBLE | |||
| int64_t js = 0; | |||
| svint64_t index = svindex_s64(0LL, lda); | |||
| svbool_t pn = svwhilelt_b64(js, n); | |||
| int n_active = svcntp_b64(svptrue_b64(), pn); | |||
| #else | |||
| int32_t N = n; | |||
| int32_t js = 0; | |||
| svint32_t index = svindex_s32(0, lda); | |||
| svbool_t pn = svwhilelt_b32(js, N); | |||
| int n_active = svcntp_b32(svptrue_b32(), pn); | |||
| #endif | |||
| do { | |||
| ao = a; | |||
| i = 0; | |||
| ii = 0; | |||
| do { | |||
| if (ii == jj) { | |||
| for (int j = 0; j < n_active; j++) { | |||
| for (int k = 0; k < j; k++) { | |||
| *(b + 2*j * n_active + 2*k) = *(ao + k * lda + 2*j); | |||
| *(b + 2*j * n_active + 2*k + 1) = *(ao + k * lda + 2*j + 1); | |||
| } | |||
| compinv(b + 2*j * n_active + 2*j, *(ao + j * lda + 2*j), *(ao + j * lda + 2*j+1)); | |||
| //*(b + j * n_active + j) = INV(*(ao + j * lda + j)); | |||
| } | |||
| ao += n_active * 2; | |||
| b += n_active * n_active * 2; | |||
| i += n_active; | |||
| ii += n_active; | |||
| } else { | |||
| if (ii > jj) { | |||
| #ifdef DOUBLE | |||
| svfloat64_t aj_vec_real = svld1_gather_index(pn, ao, index); | |||
| svfloat64_t aj_vec_imag = svld1_gather_index(pn, ao+1, index); | |||
| #else | |||
| svfloat32_t aj_vec_real = svld1_gather_index(pn, ao, index); | |||
| svfloat32_t aj_vec_imag = svld1_gather_index(pn, ao+1, index); | |||
| #endif | |||
| svst2(pn, b, svcreate2(aj_vec_real, aj_vec_imag)); | |||
| } | |||
| ao += 2; | |||
| b += n_active * 2; | |||
| i++; | |||
| ii++; | |||
| } | |||
| } while (i < m); | |||
| a += n_active * lda; | |||
| jj += n_active; | |||
| js += n_active; | |||
| #ifdef DOUBLE | |||
| pn = svwhilelt_b64(js, n); | |||
| n_active = svcntp_b64(svptrue_b64(), pn); | |||
| } while (svptest_any(svptrue_b64(), pn)); | |||
| #else | |||
| pn = svwhilelt_b32(js, N); | |||
| n_active = svcntp_b32(svptrue_b32(), pn); | |||
| } while (svptest_any(svptrue_b32(), pn)); | |||
| #endif | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,115 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| #include "arm_sve.h" | |||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ | |||
| BLASLONG i, ii, jj; | |||
| FLOAT *ao; | |||
| lda *= 2; | |||
| jj = offset; | |||
| #ifdef DOUBLE | |||
| int64_t js = 0; | |||
| svbool_t pn = svwhilelt_b64(js, n); | |||
| int n_active = svcntp_b64(svptrue_b64(), pn); | |||
| #else | |||
| int32_t N = n; | |||
| int32_t js = 0; | |||
| svbool_t pn = svwhilelt_b32(js, N); | |||
| int n_active = svcntp_b32(svptrue_b32(), pn); | |||
| #endif | |||
| do { | |||
| ao = a; | |||
| i = 0; | |||
| ii = 0; | |||
| do { | |||
| if (ii == jj) { | |||
| for (int j = 0; j < n_active; j++) { | |||
| compinv(b + 2*j * n_active + 2*j, *(ao + j * lda + 2*j), *(ao + j * lda + 2*j+1)); | |||
| //*(b + j * n_active + j) = INV(*(ao + j * lda + j)); | |||
| for (int k = j+1; k < n_active; k++) { | |||
| *(b + 2*j * n_active + 2*k) = *(ao + j * lda + 2*k); | |||
| *(b + 2*j * n_active + 2*k + 1) = *(ao + j * lda + 2*k + 1); | |||
| } | |||
| } | |||
| b += n_active * n_active * 2; | |||
| ao += lda * n_active * 2; | |||
| i += n_active; | |||
| ii += n_active; | |||
| } else { | |||
| if (ii < jj) { | |||
| #ifdef DOUBLE | |||
| svfloat64x2_t aj_vec = svld2(pn, ao); | |||
| #else | |||
| svfloat32x2_t aj_vec = svld2(pn, ao); | |||
| #endif | |||
| svst2(pn, b, aj_vec); | |||
| } | |||
| ao += lda; | |||
| b += n_active * 2; | |||
| i ++; | |||
| ii ++; | |||
| } | |||
| } while (i < m); | |||
| a += n_active * 2; | |||
| jj += n_active; | |||
| js += n_active; | |||
| #ifdef DOUBLE | |||
| pn = svwhilelt_b64(js, n); | |||
| n_active = svcntp_b64(svptrue_b64(), pn); | |||
| } while (svptest_any(svptrue_b64(), pn)); | |||
| #else | |||
| pn = svwhilelt_b32(js, N); | |||
| n_active = svcntp_b32(svptrue_b32(), pn); | |||
| } while (svptest_any(svptrue_b32(), pn)); | |||
| #endif | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,119 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| #include "arm_sve.h" | |||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ | |||
| BLASLONG i, ii, jj; | |||
| FLOAT *ao; | |||
| lda *= 2; | |||
| jj = offset; | |||
| #ifdef DOUBLE | |||
| int64_t js = 0; | |||
| svint64_t index = svindex_s64(0LL, lda); | |||
| svbool_t pn = svwhilelt_b64(js, n); | |||
| int n_active = svcntp_b64(svptrue_b64(), pn); | |||
| #else | |||
| int32_t N = n; | |||
| int32_t js = 0; | |||
| svint32_t index = svindex_s32(0, lda); | |||
| svbool_t pn = svwhilelt_b32(js, N); | |||
| int n_active = svcntp_b32(svptrue_b32(), pn); | |||
| #endif | |||
| do { | |||
| ao = a; | |||
| i = 0; | |||
| ii = 0; | |||
| do { | |||
| if (ii == jj) { | |||
| for (int j = 0; j < n_active; j++) { | |||
| compinv(b + 2*j * n_active + 2*j, *(ao + j * lda + 2*j), *(ao + j * lda + 2*j+1)); | |||
| //*(b + j * n_active + j) = INV(*(ao + j * lda + j)); | |||
| for (int k = j+1; k < n_active; k++) { | |||
| *(b + 2*j * n_active + 2*k) = *(ao + k * lda + 2*j); | |||
| *(b + 2*j * n_active + 2*k + 1) = *(ao + k * lda + 2*j + 1); | |||
| } | |||
| } | |||
| ao += n_active * 2; | |||
| b += n_active * n_active * 2; | |||
| i += n_active; | |||
| ii += n_active; | |||
| } else { | |||
| if (ii < jj) { | |||
| #ifdef DOUBLE | |||
| svfloat64_t aj_vec_real = svld1_gather_index(pn, ao, index); | |||
| svfloat64_t aj_vec_imag = svld1_gather_index(pn, ao+1, index); | |||
| #else | |||
| svfloat32_t aj_vec_real = svld1_gather_index(pn, ao, index); | |||
| svfloat32_t aj_vec_imag = svld1_gather_index(pn, ao+1, index); | |||
| #endif | |||
| svst2(pn, b, svcreate2(aj_vec_real, aj_vec_imag)); | |||
| } | |||
| ao += 2; | |||
| b += n_active * 2; | |||
| i++; | |||
| ii++; | |||
| } | |||
| } while (i < m); | |||
| a += n_active * lda; | |||
| jj += n_active; | |||
| js += n_active; | |||
| #ifdef DOUBLE | |||
| pn = svwhilelt_b64(js, n); | |||
| n_active = svcntp_b64(svptrue_b64(), pn); | |||
| } while (svptest_any(svptrue_b64(), pn)); | |||
| #else | |||
| pn = svwhilelt_b32(js, N); | |||
| n_active = svcntp_b32(svptrue_b32(), pn); | |||
| } while (svptest_any(svptrue_b32(), pn)); | |||
| #endif | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,115 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| #include "arm_sve.h" | |||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ | |||
| BLASLONG i, ii, jj; | |||
| FLOAT *ao; | |||
| lda *= 2; | |||
| jj = offset; | |||
| #ifdef DOUBLE | |||
| int64_t js = 0; | |||
| svbool_t pn = svwhilelt_b64(js, n); | |||
| int n_active = svcntp_b64(svptrue_b64(), pn); | |||
| #else | |||
| int32_t N = n; | |||
| int32_t js = 0; | |||
| svbool_t pn = svwhilelt_b32(js, N); | |||
| int n_active = svcntp_b32(svptrue_b32(), pn); | |||
| #endif | |||
| do { | |||
| ao = a; | |||
| i = 0; | |||
| ii = 0; | |||
| do { | |||
| if (ii == jj) { | |||
| for (int j = 0; j < n_active; j++) { | |||
| for (int k = 0; k < j; k++) { | |||
| *(b + 2*j * n_active + 2*k) = *(ao + j * lda + 2*k); | |||
| *(b + 2*j * n_active + 2*k + 1) = *(ao + j * lda + 2*k + 1); | |||
| } | |||
| compinv(b + 2*j * n_active + 2*j, *(ao + j * lda + 2*j), *(ao + j * lda + 2*j+1)); | |||
| //*(b + j * n_active + j) = INV(*(ao + j * lda + j)); | |||
| } | |||
| ao += lda * n_active * 2; | |||
| b += n_active * n_active * 2; | |||
| i += n_active; | |||
| ii += n_active; | |||
| } else { | |||
| if (ii > jj) { | |||
| #ifdef DOUBLE | |||
| svfloat64x2_t aj_vec = svld2(pn, ao); | |||
| #else | |||
| svfloat32x2_t aj_vec = svld2(pn, ao); | |||
| #endif | |||
| svst2(pn, b, aj_vec); | |||
| } | |||
| ao += lda; | |||
| b += n_active * 2; | |||
| i ++; | |||
| ii ++; | |||
| } | |||
| } while (i < m); | |||
| a += n_active * 2; | |||
| jj += n_active; | |||
| js += n_active; | |||
| #ifdef DOUBLE | |||
| pn = svwhilelt_b64(js, n); | |||
| n_active = svcntp_b64(svptrue_b64(), pn); | |||
| } while (svptest_any(svptrue_b64(), pn)); | |||
| #else | |||
| pn = svwhilelt_b32(js, N); | |||
| n_active = svcntp_b32(svptrue_b32(), pn); | |||
| } while (svptest_any(svptrue_b32(), pn)); | |||
| #endif | |||
| return 0; | |||
| } | |||