Add ARMV8SVE to AArch64 Dynamic Dispatch (tags/v0.3.24)
| @@ -668,6 +668,7 @@ DYNAMIC_CORE += NEOVERSEN1 | |||
| ifneq ($(NO_SVE), 1) | |||
| DYNAMIC_CORE += NEOVERSEV1 | |||
| DYNAMIC_CORE += NEOVERSEN2 | |||
| DYNAMIC_CORE += ARMV8SVE | |||
| endif | |||
| DYNAMIC_CORE += CORTEXA55 | |||
| DYNAMIC_CORE += FALKOR | |||
| @@ -46,7 +46,7 @@ if (DYNAMIC_ARCH) | |||
| if (ARM64) | |||
| set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA55 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110) | |||
| if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 9.99) | |||
| set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2) | |||
| set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2 ARMV8SVE) | |||
| endif () | |||
| if (DYNAMIC_LIST) | |||
| set(DYNAMIC_CORE ARMV8 ${DYNAMIC_LIST}) | |||
| @@ -1,5 +1,6 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* Copyright 2023 The OpenBLAS Project */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| @@ -109,6 +110,11 @@ extern gotoblas_t gotoblas_NEOVERSEN2; | |||
| #else | |||
| #define gotoblas_NEOVERSEN2 gotoblas_ARMV8 | |||
| #endif | |||
| #ifdef DYN_ARMV8SVE | |||
| extern gotoblas_t gotoblas_ARMV8SVE; | |||
| #else | |||
| #define gotoblas_ARMV8SVE gotoblas_ARMV8 | |||
| #endif | |||
| #ifdef DYN_CORTEX_A55 | |||
| extern gotoblas_t gotoblas_CORTEXA55; | |||
| #else | |||
| @@ -128,9 +134,11 @@ extern gotoblas_t gotoblas_NEOVERSEN1; | |||
| #ifndef NO_SVE | |||
| extern gotoblas_t gotoblas_NEOVERSEV1; | |||
| extern gotoblas_t gotoblas_NEOVERSEN2; | |||
| extern gotoblas_t gotoblas_ARMV8SVE; | |||
| #else | |||
| #define gotoblas_NEOVERSEV1 gotoblas_ARMV8 | |||
| #define gotoblas_NEOVERSEN2 gotoblas_ARMV8 | |||
| #define gotoblas_ARMV8SVE gotoblas_ARMV8 | |||
| #endif | |||
| extern gotoblas_t gotoblas_THUNDERX3T110; | |||
| extern gotoblas_t gotoblas_CORTEXA55; | |||
| @@ -140,7 +148,7 @@ extern void openblas_warning(int verbose, const char * msg); | |||
| #define FALLBACK_VERBOSE 1 | |||
| #define NEOVERSEN1_FALLBACK "OpenBLAS : Your OS does not support SVE instructions. OpenBLAS is using Neoverse N1 kernels as a fallback, which may give poorer performance.\n" | |||
| #define NUM_CORETYPES 13 | |||
| #define NUM_CORETYPES 16 | |||
| /* | |||
| * In case asm/hwcap.h is outdated on the build system, make sure | |||
| @@ -173,6 +181,7 @@ static char *corename[] = { | |||
| "neoversen2", | |||
| "thunderx3t110", | |||
| "cortexa55", | |||
| "armv8sve", | |||
| "unknown" | |||
| }; | |||
| @@ -192,6 +201,7 @@ char *gotoblas_corename(void) { | |||
| if (gotoblas == &gotoblas_NEOVERSEN2) return corename[12]; | |||
| if (gotoblas == &gotoblas_THUNDERX3T110) return corename[13]; | |||
| if (gotoblas == &gotoblas_CORTEXA55) return corename[14]; | |||
| if (gotoblas == &gotoblas_ARMV8SVE) return corename[15]; | |||
| return corename[NUM_CORETYPES]; | |||
| } | |||
| @@ -226,6 +236,7 @@ static gotoblas_t *force_coretype(char *coretype) { | |||
| case 12: return (&gotoblas_NEOVERSEN2); | |||
| case 13: return (&gotoblas_THUNDERX3T110); | |||
| case 14: return (&gotoblas_CORTEXA55); | |||
| case 15: return (&gotoblas_ARMV8SVE); | |||
| } | |||
| snprintf(message, 128, "Core not found: %s\n", coretype); | |||
| openblas_warning(1, message); | |||
| @@ -345,6 +356,12 @@ static gotoblas_t *get_coretype(void) { | |||
| snprintf(coremsg, 128, "Unknown CPU model - implementer %x part %x\n",implementer,part); | |||
| openblas_warning(1, coremsg); | |||
| } | |||
| #ifndef NO_SVE | |||
| if ((getauxval(AT_HWCAP) & HWCAP_SVE)) { | |||
| return &gotoblas_ARMV8SVE; | |||
| } | |||
| #endif | |||
| return NULL; | |||
| #endif | |||
| } | |||
| @@ -1,5 +1,6 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* Copyright 2023 The OpenBLAS Project */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| @@ -52,7 +53,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ | |||
| boffset = b; | |||
| j = 0; | |||
| svbool_t pg = svwhilelt_b32(j, n); | |||
| svbool_t pg = svwhilelt_b32((uint64_t)j, (uint64_t)n); | |||
| uint32_t active = svcntp_b32(svptrue_b32(), pg); | |||
| do { | |||
| @@ -69,7 +70,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ | |||
| aoffset += active * lda * 2; | |||
| j += svcntw(); | |||
| pg = svwhilelt_b32(j, n); | |||
| pg = svwhilelt_b32((uint64_t)j, (uint64_t)n); | |||
| active = svcntp_b32(svptrue_b32(), pg); | |||
| @@ -1,5 +1,6 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* Copyright 2023 The OpenBLAS Project */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| @@ -50,7 +51,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ | |||
| boffset = b; | |||
| j = 0; | |||
| svbool_t pg = svwhilelt_b32(j, n); | |||
| svbool_t pg = svwhilelt_b32((uint64_t)j, (uint64_t)n); | |||
| uint32_t active = svcntp_b32(svptrue_b32(), pg); | |||
| do { | |||
| @@ -66,7 +67,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ | |||
| aoffset += active * 2; | |||
| j += svcntw(); | |||
| pg = svwhilelt_b32(j, n); | |||
| pg = svwhilelt_b32((uint64_t)j, (uint64_t)n); | |||
| active = svcntp_b32(svptrue_b32(), pg); | |||
| } while (svptest_any(svptrue_b32(), pg)); | |||
| @@ -1,5 +1,6 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* Copyright 2023 The OpenBLAS Project */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| @@ -52,7 +53,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| svint64_t one_vec = svdup_s64(1LL); | |||
| int64_t j = 0; | |||
| svbool_t pg = svwhilelt_b64(j, n); | |||
| svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); | |||
| int64_t active = svcntp_b64(svptrue_b64(), pg); | |||
| svint64_t index_neg = svindex_s64(0LL, -1LL); | |||
| svint64_t index = svindex_s64(0LL, 1LL); | |||
| @@ -86,7 +87,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| posX += sve_size; | |||
| posX_vec = svdup_s64(posX); | |||
| j += sve_size; | |||
| pg = svwhilelt_b64(j, n); | |||
| pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); | |||
| active = svcntp_b64(svptrue_b64(), pg); | |||
| } while (svptest_any(svptrue_b64(), pg)); | |||
| @@ -99,7 +100,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| int32_t N = n; | |||
| int32_t j = 0; | |||
| svbool_t pg = svwhilelt_b32(j, N); | |||
| svbool_t pg = svwhilelt_b32((uint32_t)j, (uint32_t)N); | |||
| int32_t active = svcntp_b32(svptrue_b32(), pg); | |||
| svint32_t index_neg = svindex_s32(0, -1); | |||
| svint32_t index = svindex_s32(0, 1); | |||
| @@ -133,7 +134,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| posX += sve_size; | |||
| posX_vec = svdup_s32(posX); | |||
| j += sve_size; | |||
| pg = svwhilelt_b32(j, N); | |||
| pg = svwhilelt_b32((uint32_t)j, (uint32_t)N); | |||
| active = svcntp_b32(svptrue_b32(), pg); | |||
| } while (svptest_any(svptrue_b32(), pg)); | |||
| @@ -1,5 +1,6 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* Copyright 2023 The OpenBLAS Project */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| @@ -52,7 +53,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| svint64_t one_vec = svdup_s64(1LL); | |||
| int64_t j = 0; | |||
| svbool_t pg = svwhilelt_b64(j, n); | |||
| svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); | |||
| int64_t active = svcntp_b64(svptrue_b64(), pg); | |||
| svint64_t index_neg = svindex_s64(0LL, -1LL); | |||
| svint64_t index = svindex_s64(0LL, 1LL); | |||
| @@ -86,7 +87,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| posX += sve_size; | |||
| posX_vec = svdup_s64(posX); | |||
| j += sve_size; | |||
| pg = svwhilelt_b64(j, n); | |||
| pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); | |||
| active = svcntp_b64(svptrue_b64(), pg); | |||
| } while (svptest_any(svptrue_b64(), pg)); | |||
| @@ -99,7 +100,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| int32_t N = n; | |||
| int32_t j = 0; | |||
| svbool_t pg = svwhilelt_b32(j, N); | |||
| svbool_t pg = svwhilelt_b32((uint32_t)j, (uint32_t)N); | |||
| int32_t active = svcntp_b32(svptrue_b32(), pg); | |||
| svint32_t index_neg = svindex_s32(0, -1); | |||
| svint32_t index = svindex_s32(0, 1); | |||
| @@ -133,7 +134,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| posX += sve_size; | |||
| posX_vec = svdup_s32(posX); | |||
| j += sve_size; | |||
| pg = svwhilelt_b32(j, N); | |||
| pg = svwhilelt_b32((uint32_t)j, (uint32_t)N); | |||
| active = svcntp_b32(svptrue_b32(), pg); | |||
| } while (svptest_any(svptrue_b32(), pg)); | |||
| @@ -1,5 +1,6 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* Copyright 2023 The OpenBLAS Project */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| @@ -55,12 +56,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT | |||
| jj = offset; | |||
| #ifdef DOUBLE | |||
| int64_t js = 0; | |||
| svbool_t pn = svwhilelt_b64(js, n); | |||
| svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); | |||
| int n_active = svcntp_b64(svptrue_b64(), pn); | |||
| #else | |||
| int32_t N = n; | |||
| int32_t js = 0; | |||
| svbool_t pn = svwhilelt_b32(js, N); | |||
| svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); | |||
| int n_active = svcntp_b32(svptrue_b32(), pn); | |||
| #endif | |||
| do { | |||
| @@ -104,11 +105,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT | |||
| js += n_active; | |||
| #ifdef DOUBLE | |||
| pn = svwhilelt_b64(js, n); | |||
| pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); | |||
| n_active = svcntp_b64(svptrue_b64(), pn); | |||
| } while (svptest_any(svptrue_b64(), pn)); | |||
| #else | |||
| pn = svwhilelt_b32(js, N); | |||
| pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); | |||
| n_active = svcntp_b32(svptrue_b32(), pn); | |||
| } while (svptest_any(svptrue_b32(), pn)); | |||
| #endif | |||
| @@ -1,5 +1,6 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* Copyright 2023 The OpenBLAS Project */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| @@ -56,13 +57,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT | |||
| #ifdef DOUBLE | |||
| int64_t js = 0; | |||
| svint64_t index = svindex_s64(0LL, lda); | |||
| svbool_t pn = svwhilelt_b64(js, n); | |||
| svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); | |||
| int n_active = svcntp_b64(svptrue_b64(), pn); | |||
| #else | |||
| int32_t N = n; | |||
| int32_t js = 0; | |||
| svint32_t index = svindex_s32(0, lda); | |||
| svbool_t pn = svwhilelt_b32(js, N); | |||
| svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); | |||
| int n_active = svcntp_b32(svptrue_b32(), pn); | |||
| #endif | |||
| do { | |||
| @@ -106,11 +107,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT | |||
| js += n_active; | |||
| #ifdef DOUBLE | |||
| pn = svwhilelt_b64(js, n); | |||
| pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); | |||
| n_active = svcntp_b64(svptrue_b64(), pn); | |||
| } while (svptest_any(svptrue_b64(), pn)); | |||
| #else | |||
| pn = svwhilelt_b32(js, N); | |||
| pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); | |||
| n_active = svcntp_b32(svptrue_b32(), pn); | |||
| } while (svptest_any(svptrue_b32(), pn)); | |||
| #endif | |||
| @@ -1,5 +1,6 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* Copyright 2023 The OpenBLAS Project */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| @@ -55,12 +56,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT | |||
| jj = offset; | |||
| #ifdef DOUBLE | |||
| int64_t js = 0; | |||
| svbool_t pn = svwhilelt_b64(js, n); | |||
| svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); | |||
| int n_active = svcntp_b64(svptrue_b64(), pn); | |||
| #else | |||
| int32_t N = n; | |||
| int32_t js = 0; | |||
| svbool_t pn = svwhilelt_b32(js, N); | |||
| svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); | |||
| int n_active = svcntp_b32(svptrue_b32(), pn); | |||
| #endif | |||
| do { | |||
| @@ -104,11 +105,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT | |||
| js += n_active; | |||
| #ifdef DOUBLE | |||
| pn = svwhilelt_b64(js, n); | |||
| pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); | |||
| n_active = svcntp_b64(svptrue_b64(), pn); | |||
| } while (svptest_any(svptrue_b64(), pn)); | |||
| #else | |||
| pn = svwhilelt_b32(js, N); | |||
| pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); | |||
| n_active = svcntp_b32(svptrue_b32(), pn); | |||
| } while (svptest_any(svptrue_b32(), pn)); | |||
| #endif | |||
| @@ -1,5 +1,6 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* Copyright 2023 The OpenBLAS Project */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| @@ -52,7 +53,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ | |||
| boffset = b; | |||
| j = 0; | |||
| svbool_t pg = svwhilelt_b64(j, n); | |||
| svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); | |||
| uint64_t active = svcntp_b64(svptrue_b64(), pg); | |||
| do { | |||
| @@ -69,7 +70,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ | |||
| aoffset += active * lda * 2; | |||
| j += svcntd(); | |||
| pg = svwhilelt_b64(j, n); | |||
| pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); | |||
| active = svcntp_b64(svptrue_b64(), pg); | |||
| @@ -1,5 +1,6 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* Copyright 2023 The OpenBLAS Project */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| @@ -50,7 +51,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ | |||
| boffset = b; | |||
| j = 0; | |||
| svbool_t pg = svwhilelt_b64(j, n); | |||
| svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); | |||
| uint64_t active = svcntp_b64(svptrue_b64(), pg); | |||
| do { | |||
| @@ -66,7 +67,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ | |||
| aoffset += active * 2; | |||
| j += svcntd(); | |||
| pg = svwhilelt_b64(j, n); | |||
| pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); | |||
| active = svcntp_b64(svptrue_b64(), pg); | |||
| } while (svptest_any(svptrue_b64(), pg)); | |||
| @@ -1,5 +1,6 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* Copyright 2023 The OpenBLAS Project */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| @@ -54,7 +55,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| svint64_t one_vec = svdup_s64(1LL); | |||
| int64_t j = 0; | |||
| svbool_t pg = svwhilelt_b64(j, n); | |||
| svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); | |||
| int64_t active = svcntp_b64(svptrue_b64(), pg); | |||
| svint64_t index_neg = svindex_s64(0LL, -1LL); | |||
| svint64_t index = svindex_s64(0LL, 1LL); | |||
| @@ -79,7 +80,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| gat_ind = svadd_m(cmp, gat_ind, lda_vec); | |||
| gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, 2); | |||
| if (offset <= 0) { | |||
| svbool_t off_g = svwhilelt_b64(offset, 0LL); | |||
| svbool_t off_g = svwhilelt_b64((uint64_t)offset, (uint64_t)0LL); | |||
| data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag); | |||
| } | |||
| @@ -99,7 +100,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| posX += sve_size; | |||
| posX_vec = svdup_s64(posX); | |||
| j += sve_size; | |||
| pg = svwhilelt_b64(j, n); | |||
| pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); | |||
| active = svcntp_b64(svptrue_b64(), pg); | |||
| } while (svptest_any(svptrue_b64(), pg)); | |||
| @@ -117,7 +118,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| int32_t j = 0; | |||
| int32_t N = n; | |||
| svbool_t pg = svwhilelt_b32(j, N); | |||
| svbool_t pg = svwhilelt_b32((uint32_t)j, (uint32_t)N); | |||
| int32_t active = svcntp_b32(svptrue_b32(), pg); | |||
| svint32_t index_neg = svindex_s32(0, -1); | |||
| svint32_t index = svindex_s32(0, 1); | |||
| @@ -142,7 +143,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| gat_ind = svadd_m(cmp, gat_ind, lda_vec); | |||
| gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, 2); | |||
| if (offset <= 0) { | |||
| svbool_t off_g = svwhilelt_b32(offset, 0); | |||
| svbool_t off_g = svwhilelt_b32((uint32_t)offset, (uint32_t)0); | |||
| data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag); | |||
| } | |||
| @@ -162,7 +163,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| posX += sve_size; | |||
| posX_vec = svdup_s32(posX); | |||
| j += sve_size; | |||
| pg = svwhilelt_b32(j, N); | |||
| pg = svwhilelt_b32((uint32_t)j, (uint32_t)N); | |||
| active = svcntp_b32(svptrue_b32(), pg); | |||
| } while (svptest_any(svptrue_b32(), pg)); | |||
| @@ -1,5 +1,6 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* Copyright 2023 The OpenBLAS Project */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| @@ -54,7 +55,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| svint64_t one_vec = svdup_s64(1LL); | |||
| int64_t j = 0; | |||
| svbool_t pg = svwhilelt_b64(j, n); | |||
| svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); | |||
| int64_t active = svcntp_b64(svptrue_b64(), pg); | |||
| svint64_t index_neg = svindex_s64(0LL, -1LL); | |||
| svint64_t index = svindex_s64(0LL, 1LL); | |||
| @@ -80,7 +81,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, lda_vec); | |||
| data_vec_imag = svneg_z(pg, data_vec_imag); | |||
| if (offset <= 0) { | |||
| svbool_t off_g = svwhilelt_b64(offset, 0LL); | |||
| svbool_t off_g = svwhilelt_b64((uint64_t)offset, (uint64_t)0LL); | |||
| data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag); | |||
| } | |||
| @@ -100,7 +101,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| posX += sve_size; | |||
| posX_vec = svdup_s64(posX); | |||
| j += sve_size; | |||
| pg = svwhilelt_b64(j, n); | |||
| pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); | |||
| active = svcntp_b64(svptrue_b64(), pg); | |||
| } while (svptest_any(svptrue_b64(), pg)); | |||
| #else | |||
| @@ -116,7 +117,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| int32_t j = 0; | |||
| int32_t N = n; | |||
| svbool_t pg = svwhilelt_b32(j, N); | |||
| svbool_t pg = svwhilelt_b32((uint32_t)j, (uint32_t)N); | |||
| int32_t active = svcntp_b32(svptrue_b32(), pg); | |||
| svint32_t index_neg = svindex_s32(0, -1); | |||
| svint32_t index = svindex_s32(0, 1); | |||
| @@ -142,7 +143,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, lda_vec); | |||
| data_vec_imag = svneg_z(pg, data_vec_imag); | |||
| if (offset <= 0) { | |||
| svbool_t off_g = svwhilelt_b32(offset, 0); | |||
| svbool_t off_g = svwhilelt_b32((uint32_t)offset, (uint32_t)0); | |||
| data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag); | |||
| } | |||
| @@ -162,7 +163,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| posX += sve_size; | |||
| posX_vec = svdup_s32(posX); | |||
| j += sve_size; | |||
| pg = svwhilelt_b32(j, N); | |||
| pg = svwhilelt_b32((uint32_t)j, (uint32_t)N); | |||
| active = svcntp_b32(svptrue_b32(), pg); | |||
| } while (svptest_any(svptrue_b32(), pg)); | |||
| @@ -1,5 +1,6 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* Copyright 2023 The OpenBLAS Project */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| @@ -53,7 +54,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| svint64_t one_vec = svdup_s64(1LL); | |||
| int64_t j = 0; | |||
| svbool_t pg = svwhilelt_b64(j, n); | |||
| svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); | |||
| int64_t active = svcntp_b64(svptrue_b64(), pg); | |||
| svint64_t index_neg = svindex_s64(0LL, -1LL); | |||
| svint64_t index = svindex_s64(0LL, 1LL); | |||
| @@ -90,7 +91,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| posX += sve_size; | |||
| posX_vec = svdup_s64(posX); | |||
| j += sve_size; | |||
| pg = svwhilelt_b64(j, n); | |||
| pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); | |||
| active = svcntp_b64(svptrue_b64(), pg); | |||
| } while (svptest_any(svptrue_b64(), pg)); | |||
| @@ -103,7 +104,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| int32_t N = n; | |||
| int32_t j = 0; | |||
| svbool_t pg = svwhilelt_b32(j, N); | |||
| svbool_t pg = svwhilelt_b32((uint32_t)j, (uint32_t)N); | |||
| int32_t active = svcntp_b32(svptrue_b32(), pg); | |||
| svint32_t index_neg = svindex_s32(0, -1); | |||
| svint32_t index = svindex_s32(0, 1); | |||
| @@ -140,7 +141,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| posX += sve_size; | |||
| posX_vec = svdup_s32(posX); | |||
| j += sve_size; | |||
| pg = svwhilelt_b32(j, N); | |||
| pg = svwhilelt_b32((uint32_t)j, (uint32_t)N); | |||
| active = svcntp_b32(svptrue_b32(), pg); | |||
| } while (svptest_any(svptrue_b32(), pg)); | |||
| @@ -1,5 +1,6 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* Copyright 2023 The OpenBLAS Project */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| @@ -53,7 +54,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| svint64_t one_vec = svdup_s64(1LL); | |||
| int64_t j = 0; | |||
| svbool_t pg = svwhilelt_b64(j, n); | |||
| svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); | |||
| int64_t active = svcntp_b64(svptrue_b64(), pg); | |||
| svint64_t index_neg = svindex_s64(0LL, -1LL); | |||
| svint64_t index = svindex_s64(0LL, 1LL); | |||
| @@ -90,7 +91,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| posX += sve_size; | |||
| posX_vec = svdup_s64(posX); | |||
| j += sve_size; | |||
| pg = svwhilelt_b64(j, n); | |||
| pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); | |||
| active = svcntp_b64(svptrue_b64(), pg); | |||
| } while (svptest_any(svptrue_b64(), pg)); | |||
| @@ -103,7 +104,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| int32_t N = n; | |||
| int32_t j = 0; | |||
| svbool_t pg = svwhilelt_b32(j, N); | |||
| svbool_t pg = svwhilelt_b32((uint32_t)j, (uint32_t)N); | |||
| int32_t active = svcntp_b32(svptrue_b32(), pg); | |||
| svint32_t index_neg = svindex_s32(0, -1); | |||
| svint32_t index = svindex_s32(0, 1); | |||
| @@ -140,7 +141,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| posX += sve_size; | |||
| posX_vec = svdup_s32(posX); | |||
| j += sve_size; | |||
| pg = svwhilelt_b32(j, N); | |||
| pg = svwhilelt_b32((uint32_t)j, (uint32_t)N); | |||
| active = svcntp_b32(svptrue_b32(), pg); | |||
| } while (svptest_any(svptrue_b32(), pg)); | |||
| @@ -1,5 +1,6 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* Copyright 2023 The OpenBLAS Project */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| @@ -54,11 +55,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| FLOAT *ao; | |||
| #ifdef DOUBLE | |||
| svint64_t index = svindex_s64(0LL, lda); | |||
| svbool_t pn = svwhilelt_b64(js, n); | |||
| svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); | |||
| int n_active = svcntp_b64(svptrue_b64(), pn); | |||
| #else | |||
| svint32_t index = svindex_s32(0, lda); | |||
| svbool_t pn = svwhilelt_b32(js, n); | |||
| svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); | |||
| int n_active = svcntp_b32(svptrue_b32(), pn); | |||
| #endif | |||
| do | |||
| @@ -132,11 +133,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| posY += n_active; | |||
| js += n_active; | |||
| #ifdef DOUBLE | |||
| pn = svwhilelt_b64(js, n); | |||
| pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); | |||
| n_active = svcntp_b64(svptrue_b64(), pn); | |||
| } while (svptest_any(svptrue_b64(), pn)); | |||
| #else | |||
| pn = svwhilelt_b32(js, n); | |||
| pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); | |||
| n_active = svcntp_b32(svptrue_b32(), pn); | |||
| } while (svptest_any(svptrue_b32(), pn)); | |||
| #endif | |||
| @@ -1,5 +1,6 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* Copyright 2023 The OpenBLAS Project */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| @@ -53,10 +54,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| FLOAT *ao; | |||
| js = 0; | |||
| #ifdef DOUBLE | |||
| svbool_t pn = svwhilelt_b64(js, n); | |||
| svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); | |||
| int n_active = svcntp_b64(svptrue_b64(), pn); | |||
| #else | |||
| svbool_t pn = svwhilelt_b32(js, n); | |||
| svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); | |||
| int n_active = svcntp_b32(svptrue_b32(), pn); | |||
| #endif | |||
| do | |||
| @@ -129,11 +130,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| posY += n_active; | |||
| js += n_active; | |||
| #ifdef DOUBLE | |||
| pn = svwhilelt_b64(js, n); | |||
| pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); | |||
| n_active = svcntp_b64(svptrue_b64(), pn); | |||
| } while (svptest_any(svptrue_b64(), pn)); | |||
| #else | |||
| pn = svwhilelt_b32(js, n); | |||
| pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); | |||
| n_active = svcntp_b32(svptrue_b32(), pn); | |||
| } while (svptest_any(svptrue_b32(), pn)); | |||
| #endif | |||
| @@ -1,5 +1,6 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* Copyright 2023 The OpenBLAS Project */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| @@ -54,11 +55,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| FLOAT *ao; | |||
| #ifdef DOUBLE | |||
| svint64_t index = svindex_s64(0LL, lda); | |||
| svbool_t pn = svwhilelt_b64(js, n); | |||
| svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); | |||
| int n_active = svcntp_b64(svptrue_b64(), pn); | |||
| #else | |||
| svint32_t index = svindex_s32(0, lda); | |||
| svbool_t pn = svwhilelt_b32(js, n); | |||
| svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); | |||
| int n_active = svcntp_b32(svptrue_b32(), pn); | |||
| #endif | |||
| do | |||
| @@ -132,11 +133,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| posY += n_active; | |||
| js += n_active; | |||
| #ifdef DOUBLE | |||
| pn = svwhilelt_b64(js, n); | |||
| pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); | |||
| n_active = svcntp_b64(svptrue_b64(), pn); | |||
| } while (svptest_any(svptrue_b64(), pn)); | |||
| #else | |||
| pn = svwhilelt_b32(js, n); | |||
| pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); | |||
| n_active = svcntp_b32(svptrue_b32(), pn); | |||
| } while (svptest_any(svptrue_b32(), pn)); | |||
| #endif | |||
| @@ -1,5 +1,6 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* Copyright 2023 The OpenBLAS Project */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| @@ -53,10 +54,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| FLOAT *ao; | |||
| js = 0; | |||
| #ifdef DOUBLE | |||
| svbool_t pn = svwhilelt_b64(js, n); | |||
| svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); | |||
| int n_active = svcntp_b64(svptrue_b64(), pn); | |||
| #else | |||
| svbool_t pn = svwhilelt_b32(js, n); | |||
| svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); | |||
| int n_active = svcntp_b32(svptrue_b32(), pn); | |||
| #endif | |||
| do | |||
| @@ -128,11 +129,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| posY += n_active; | |||
| js += n_active; | |||
| #ifdef DOUBLE | |||
| pn = svwhilelt_b64(js, n); | |||
| pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); | |||
| n_active = svcntp_b64(svptrue_b64(), pn); | |||
| } while (svptest_any(svptrue_b64(), pn)); | |||
| #else | |||
| pn = svwhilelt_b32(js, n); | |||
| pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); | |||
| n_active = svcntp_b32(svptrue_b32(), pn); | |||
| } while (svptest_any(svptrue_b32(), pn)); | |||
| #endif | |||
| @@ -1,5 +1,6 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* Copyright 2023 The OpenBLAS Project */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| @@ -52,13 +53,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT | |||
| #ifdef DOUBLE | |||
| int64_t js = 0; | |||
| svint64_t index = svindex_s64(0LL, lda); | |||
| svbool_t pn = svwhilelt_b64(js, n); | |||
| svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); | |||
| int n_active = svcntp_b64(svptrue_b64(), pn); | |||
| #else | |||
| int32_t N = n; | |||
| int32_t js = 0; | |||
| svint32_t index = svindex_s32(0, lda); | |||
| svbool_t pn = svwhilelt_b32(js, N); | |||
| svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); | |||
| int n_active = svcntp_b32(svptrue_b32(), pn); | |||
| #endif | |||
| do { | |||
| @@ -106,11 +107,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT | |||
| js += n_active; | |||
| #ifdef DOUBLE | |||
| pn = svwhilelt_b64(js, n); | |||
| pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); | |||
| n_active = svcntp_b64(svptrue_b64(), pn); | |||
| } while (svptest_any(svptrue_b64(), pn)); | |||
| #else | |||
| pn = svwhilelt_b32(js, N); | |||
| pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); | |||
| n_active = svcntp_b32(svptrue_b32(), pn); | |||
| } while (svptest_any(svptrue_b32(), pn)); | |||
| #endif | |||
| @@ -1,5 +1,6 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* Copyright 2023 The OpenBLAS Project */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| @@ -51,12 +52,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT | |||
| jj = offset; | |||
| #ifdef DOUBLE | |||
| int64_t js = 0; | |||
| svbool_t pn = svwhilelt_b64(js, n); | |||
| svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); | |||
| int n_active = svcntp_b64(svptrue_b64(), pn); | |||
| #else | |||
| int32_t N = n; | |||
| int32_t js = 0; | |||
| svbool_t pn = svwhilelt_b32(js, N); | |||
| svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); | |||
| int n_active = svcntp_b32(svptrue_b32(), pn); | |||
| #endif | |||
| do { | |||
| @@ -102,11 +103,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT | |||
| js += n_active; | |||
| #ifdef DOUBLE | |||
| pn = svwhilelt_b64(js, n); | |||
| pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); | |||
| n_active = svcntp_b64(svptrue_b64(), pn); | |||
| } while (svptest_any(svptrue_b64(), pn)); | |||
| #else | |||
| pn = svwhilelt_b32(js, N); | |||
| pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); | |||
| n_active = svcntp_b32(svptrue_b32(), pn); | |||
| } while (svptest_any(svptrue_b32(), pn)); | |||
| #endif | |||
| @@ -1,5 +1,6 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* Copyright 2023 The OpenBLAS Project */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| @@ -52,13 +53,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT | |||
| #ifdef DOUBLE | |||
| int64_t js = 0; | |||
| svint64_t index = svindex_s64(0LL, lda); | |||
| svbool_t pn = svwhilelt_b64(js, n); | |||
| svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); | |||
| int n_active = svcntp_b64(svptrue_b64(), pn); | |||
| #else | |||
| int32_t N = n; | |||
| int32_t js = 0; | |||
| svint32_t index = svindex_s32(0, lda); | |||
| svbool_t pn = svwhilelt_b32(js, N); | |||
| svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); | |||
| int n_active = svcntp_b32(svptrue_b32(), pn); | |||
| #endif | |||
| do { | |||
| @@ -106,11 +107,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT | |||
| js += n_active; | |||
| #ifdef DOUBLE | |||
| pn = svwhilelt_b64(js, n); | |||
| pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); | |||
| n_active = svcntp_b64(svptrue_b64(), pn); | |||
| } while (svptest_any(svptrue_b64(), pn)); | |||
| #else | |||
| pn = svwhilelt_b32(js, N); | |||
| pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); | |||
| n_active = svcntp_b32(svptrue_b32(), pn); | |||
| } while (svptest_any(svptrue_b32(), pn)); | |||
| #endif | |||
| @@ -1,5 +1,6 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* Copyright 2023 The OpenBLAS Project */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| @@ -51,12 +52,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT | |||
| jj = offset; | |||
| #ifdef DOUBLE | |||
| int64_t js = 0; | |||
| svbool_t pn = svwhilelt_b64(js, n); | |||
| svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); | |||
| int n_active = svcntp_b64(svptrue_b64(), pn); | |||
| #else | |||
| int32_t N = n; | |||
| int32_t js = 0; | |||
| svbool_t pn = svwhilelt_b32(js, N); | |||
| svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); | |||
| int n_active = svcntp_b32(svptrue_b32(), pn); | |||
| #endif | |||
| do { | |||
| @@ -102,11 +103,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT | |||
| js += n_active; | |||
| #ifdef DOUBLE | |||
| pn = svwhilelt_b64(js, n); | |||
| pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); | |||
| n_active = svcntp_b64(svptrue_b64(), pn); | |||
| } while (svptest_any(svptrue_b64(), pn)); | |||
| #else | |||
| pn = svwhilelt_b32(js, N); | |||
| pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); | |||
| n_active = svcntp_b32(svptrue_b32(), pn); | |||
| } while (svptest_any(svptrue_b32(), pn)); | |||
| #endif | |||
| @@ -3371,7 +3371,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d | |||
| #define CGEMM_DEFAULT_R 4096 | |||
| #define ZGEMM_DEFAULT_R 4096 | |||
| #elif defined(NEOVERSEV1) | |||
| #elif defined(NEOVERSEV1) // 256-bit SVE | |||
| #if defined(XDOUBLE) || defined(DOUBLE) | |||
| #define SWITCH_RATIO 8 | |||
| @@ -3449,7 +3449,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d | |||
| #define CGEMM_DEFAULT_R 4096 | |||
| #define ZGEMM_DEFAULT_R 4096 | |||
| #elif defined(ARMV8SVE) || defined(A64FX) || defined(ARMV9) || defined(CORTEXA510)|| defined(CORTEXA710) || defined(CORTEXX2) | |||
| #elif defined(A64FX) // 512-bit SVE | |||
| /* When all BLAS3 routines are implemented with SVE, SGEMM_DEFAULT_UNROLL_M should be "sve_vl". | |||
| Until then, just keep it different than DGEMM_DEFAULT_UNROLL_N to keep copy routines in both directions separated. */ | |||
| @@ -3490,6 +3490,43 @@ Until then, just keep it different than DGEMM_DEFAULT_UNROLL_N to keep copy rout | |||
| #define CGEMM_DEFAULT_R 4096 | |||
| #define ZGEMM_DEFAULT_R 4096 | |||
| #elif defined(ARMV8SVE) || defined(ARMV9) || defined(CORTEXA510)|| defined(CORTEXA710) || defined(CORTEXX2) // 128-bit SVE | |||
| #if defined(XDOUBLE) || defined(DOUBLE) | |||
| #define SWITCH_RATIO 8 | |||
| #else | |||
| #define SWITCH_RATIO 16 | |||
| #endif | |||
| #define SGEMM_DEFAULT_UNROLL_M 4 // Actually 1VL (8) but kept separate to keep copies separate | |||
| #define SGEMM_DEFAULT_UNROLL_N 8 | |||
| #define DGEMM_DEFAULT_UNROLL_M 4 | |||
| #define DGEMM_DEFAULT_UNROLL_N 8 | |||
| #define CGEMM_DEFAULT_UNROLL_M 2 | |||
| #define CGEMM_DEFAULT_UNROLL_N 4 | |||
| #define CGEMM_DEFAULT_UNROLL_MN 16 | |||
| #define ZGEMM_DEFAULT_UNROLL_M 2 | |||
| #define ZGEMM_DEFAULT_UNROLL_N 4 | |||
| #define ZGEMM_DEFAULT_UNROLL_MN 16 | |||
| #define SGEMM_DEFAULT_P 128 | |||
| #define DGEMM_DEFAULT_P 160 | |||
| #define CGEMM_DEFAULT_P 128 | |||
| #define ZGEMM_DEFAULT_P 128 | |||
| #define SGEMM_DEFAULT_Q 352 | |||
| #define DGEMM_DEFAULT_Q 128 | |||
| #define CGEMM_DEFAULT_Q 224 | |||
| #define ZGEMM_DEFAULT_Q 112 | |||
| #define SGEMM_DEFAULT_R 4096 | |||
| #define DGEMM_DEFAULT_R 4096 | |||
| #define CGEMM_DEFAULT_R 4096 | |||
| #define ZGEMM_DEFAULT_R 4096 | |||
| #else /* Other/undetected ARMv8 cores */ | |||
| #define SGEMM_DEFAULT_UNROLL_M 16 | |||