| @@ -229,3 +229,6 @@ In chronological order: | |||
| * Christopher Daley <https://github.com/cdaley> | |||
| * [2024-01-24] Optimize GEMV forwarding on ARM64 systems | |||
| * Aymen Qader <aymen.qader@arm.com> | |||
| * [2024-12-09] Add Arm®v9-A architecture SME2 SGEMM kernels | |||
| @@ -30,6 +30,11 @@ FCOMMON_OPT += -march=armv8-a+sve | |||
| endif | |||
| endif | |||
| ifeq ($(CORE), ARMV9SME) | |||
| CCOMMON_OPT += -march=armv9-a+sme2 -O3 | |||
| FCOMMON_OPT += -march=armv9-a+sve2 -O3 | |||
| endif | |||
| ifeq ($(CORE), CORTEXA53) | |||
| CCOMMON_OPT += -march=armv8-a -mtune=cortex-a53 | |||
| ifneq ($(F_COMPILER), NAG) | |||
| @@ -420,6 +420,7 @@ ifeq ($(ARCH), arm64) | |||
| export MACOSX_DEPLOYMENT_TARGET=11.0 | |||
| ifeq ($(C_COMPILER), GCC) | |||
| export NO_SVE = 1 | |||
| export NO_SME = 1 | |||
| endif | |||
| else | |||
| export MACOSX_DEPLOYMENT_TARGET=10.8 | |||
| @@ -709,6 +710,11 @@ DYNAMIC_CORE += NEOVERSEN2 | |||
| DYNAMIC_CORE += ARMV8SVE | |||
| DYNAMIC_CORE += A64FX | |||
| endif | |||
| # Disabled by default while ARMV9SME is WIP | |||
| NO_SME ?= 1 | |||
| ifneq ($(NO_SME), 1) | |||
| DYNAMIC_CORE += ARMV9SME | |||
| endif | |||
| DYNAMIC_CORE += THUNDERX | |||
| DYNAMIC_CORE += THUNDERX2T99 | |||
| DYNAMIC_CORE += TSV110 | |||
| @@ -1474,6 +1480,10 @@ ifeq ($(NO_SVE), 1) | |||
| CCOMMON_OPT += -DNO_SVE | |||
| endif | |||
| ifeq ($(NO_SME), 1) | |||
| CCOMMON_OPT += -DNO_SME | |||
| endif | |||
| ifdef SMP | |||
| CCOMMON_OPT += -DSMP_SERVER | |||
| @@ -188,6 +188,7 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th | |||
| - **Apple Vortex**: preliminary support based on ThunderX2/3 | |||
| - **A64FX**: preliminary support, optimized Level-3 BLAS | |||
| - **ARMV8SVE**: any ARMV8 cpu with SVE extensions | |||
| - **ARMV9SME**: WIP target, any Arm®v9-A core with SME2 support. Only functional for GEMM. | |||
| #### PPC/PPC64 | |||
| @@ -111,6 +111,7 @@ THUNDERX3T110 | |||
| VORTEX | |||
| A64FX | |||
| ARMV8SVE | |||
| ARMV9SME | |||
| FT2000 | |||
| 9.System Z: | |||
| @@ -44,9 +44,21 @@ endif () | |||
| if (DYNAMIC_ARCH) | |||
| if (ARM64) | |||
| set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110) | |||
| if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 9.99) | |||
| set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2 ARMV8SVE A64FX) | |||
| set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110) | |||
| if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU") | |||
| if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER_EQUAL 10) # SVE ACLE supported in GCC >= 10 | |||
| set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2 ARMV8SVE A64FX) | |||
| endif () | |||
| if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER_EQUAL 14) # SME ACLE supported in GCC >= 14 | |||
| set(DYNAMIC_CORE ${DYNAMIC_CORE} ARMV9SME) | |||
| endif() | |||
| elseif (${CMAKE_C_COMPILER_ID} MATCHES "Clang") | |||
| if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER_EQUAL 11) # SVE ACLE supported in LLVM >= 11 | |||
| set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2 ARMV8SVE A64FX) | |||
| endif () | |||
| if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER_EQUAL 19) # SME ACLE supported in LLVM >= 19 | |||
| set(DYNAMIC_CORE ${DYNAMIC_CORE} ARMV9SME) | |||
| endif() | |||
| endif () | |||
| if (DYNAMIC_LIST) | |||
| set(DYNAMIC_CORE ARMV8 ${DYNAMIC_LIST}) | |||
| @@ -238,6 +238,12 @@ if (${CORE} STREQUAL ARMV8SVE) | |||
| endif () | |||
| endif () | |||
| if (${CORE} STREQUAL ARMV9SME) | |||
| if (NOT DYNAMIC_ARCH) | |||
| set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv9-a+sme2") | |||
| endif () | |||
| endif () | |||
| if (${CORE} STREQUAL CORTEXA510) | |||
| if (NOT DYNAMIC_ARCH) | |||
| set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve") | |||
| @@ -1014,7 +1014,7 @@ endif () | |||
| set(ZGEMM_UNROLL_M 4) | |||
| set(ZGEMM_UNROLL_N 4) | |||
| set(SYMV_P 16) | |||
| elseif ("${TCORE}" STREQUAL "NEOVERSEN2") | |||
| elseif ("${TCORE}" STREQUAL "NEOVERSEN2" or "${TCORE}" STREQUAL "ARMV9SME") | |||
| file(APPEND ${TARGET_CONF_TEMP} | |||
| "#define L1_CODE_SIZE\t65536\n" | |||
| "#define L1_CODE_LINESIZE\t64\n" | |||
| @@ -310,6 +310,9 @@ if (${TARGET} STREQUAL NEOVERSEV1) | |||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv8.2-a+sve") | |||
| endif() | |||
| endif() | |||
| if (${TARGET} STREQUAL ARMV9SME) | |||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv9-a+sme2 -O3") | |||
| endif() | |||
| if (${TARGET} STREQUAL A64FX) | |||
| if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE) | |||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -Msve-intrinsics -march=armv8.2-a+sve -mtune=a64fx") | |||
| @@ -175,7 +175,7 @@ REALNAME: | |||
| #define HUGE_PAGESIZE ( 4 << 20) | |||
| #ifndef BUFFERSIZE | |||
| #if defined(NEOVERSEN1) || defined(NEOVERSEN2) || defined(NEOVERSEV1) || defined(A64FX) || defined(ARMV8SVE) | |||
| #if defined(NEOVERSEN1) || defined(NEOVERSEN2) || defined(NEOVERSEV1) || defined(A64FX) || defined(ARMV8SVE) || defined(ARMV9SME) | |||
| #define BUFFER_SIZE (32 << 22) | |||
| #else | |||
| #define BUFFER_SIZE (32 << 20) | |||
| @@ -345,6 +345,9 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||
| #if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) | |||
| /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve best performance */ | |||
| if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; | |||
| #elif defined(ARMV9SME) && !defined(DOUBLE) && !defined(COMPLEX) | |||
| /* the current SME SGEMM kernel requires n>=8*GEMM_UNROLL_N to achieve best performance */ | |||
| if (min_jj >= 8*GEMM_UNROLL_N) min_jj = 8*GEMM_UNROLL_N; | |||
| #else | |||
| if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N; | |||
| else | |||
| @@ -115,6 +115,11 @@ extern gotoblas_t gotoblas_ARMV8SVE; | |||
| #else | |||
| #define gotoblas_ARMV8SVE gotoblas_ARMV8 | |||
| #endif | |||
| #ifdef DYN_ARMV9SME | |||
| extern gotoblas_t gotoblas_ARMV9SME; | |||
| #else | |||
| #define gotoblas_ARMV9SME gotoblas_ARMV8 | |||
| #endif | |||
| #ifdef DYN_CORTEX_A55 | |||
| extern gotoblas_t gotoblas_CORTEXA55; | |||
| #else | |||
| @@ -148,6 +153,13 @@ extern gotoblas_t gotoblas_A64FX; | |||
| #define gotoblas_ARMV8SVE gotoblas_ARMV8 | |||
| #define gotoblas_A64FX gotoblas_ARMV8 | |||
| #endif | |||
| #ifndef NO_SME | |||
| extern gotoblas_t gotoblas_ARMV9SME; | |||
| #else | |||
| #define gotoblas_ARMV9SME gotoblas_ARMV8SVE | |||
| #endif | |||
| extern gotoblas_t gotoblas_THUNDERX3T110; | |||
| #endif | |||
| #define gotoblas_NEOVERSEV2 gotoblas_NEOVERSEV1 | |||
| @@ -393,6 +405,13 @@ static gotoblas_t *get_coretype(void) { | |||
| snprintf(coremsg, 128, "Unknown CPU model - implementer %x part %x\n",implementer,part); | |||
| openblas_warning(1, coremsg); | |||
| } | |||
| #if !defined(NO_SME) && defined(HWCAP2_SME2) | |||
| if ((getauxval(AT_HWCAP2) & HWCAP2_SME2)) { | |||
| return &gotoblas_ARMV9SME; | |||
| } | |||
| #endif | |||
| #ifndef NO_SVE | |||
| if ((getauxval(AT_HWCAP) & HWCAP_SVE)) { | |||
| return &gotoblas_ARMV8SVE; | |||
| @@ -1289,6 +1289,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define CORENAME "ARMV8SVE" | |||
| #endif | |||
| #ifdef FORCE_ARMV9SME | |||
| #define FORCE | |||
| #define ARCHITECTURE "ARM64" | |||
| #define SUBARCHITECTURE "ARMV9SME" | |||
| #define SUBDIRNAME "arm64" | |||
| #define ARCHCONFIG "-DARMV9SME " \ | |||
| "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ | |||
| "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ | |||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=32 " \ | |||
| "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DHAVE_SVE -DHAVE_SME -DARMV8 -DARMV9" | |||
| #define LIBNAME "armv9sme" | |||
| #define CORENAME "ARMV9SME" | |||
| #endif | |||
| #ifdef FORCE_ARMV8 | |||
| #define FORCE | |||
| @@ -0,0 +1,8 @@ | |||
| include $(KERNELDIR)/KERNEL.ARMV8SVE | |||
| SGEMMKERNEL = sgemm_kernel_sme.c | |||
| SGEMMINCOPY = sgemm_ncopy_sme.c | |||
| SGEMMITCOPY = sgemm_tcopy_sme.c | |||
| SGEMMONCOPY = sgemm_ncopy_sme.c | |||
| SGEMMOTCOPY = sgemm_tcopy_sme.c | |||
| @@ -0,0 +1,188 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2024, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |||
| CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE | |||
| GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | |||
| HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | |||
| LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF | |||
| THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include <arm_sme.h> | |||
| #include "common.h" | |||
| #include "sme_abi.h" | |||
| // Outer product kernel. | |||
| // Computes a 2SVL x 2SVL block of C, utilizing all four FP32 tiles of ZA. | |||
| // This kernel is unpredicated, and assumes a full 2SVL x 2SVL block. | |||
| __attribute__((always_inline)) inline void | |||
| kernel_2x2(const float *A, const float *B, float *C, float alpha, | |||
| size_t shared_dim, size_t a_step, size_t b_step, size_t c_step) | |||
| __arm_out("za") __arm_streaming { | |||
| const size_t svl = svcntw(); | |||
| // Predicate set-up | |||
| svbool_t ptrue = svptrue_b32(); | |||
| // Load from C into ZA | |||
| for (size_t i = 0; i < (svl >> 1); i++) { | |||
| svld1_ver_za32(0, i, ptrue, &C[0 * svl + i * c_step]); | |||
| svld1_ver_za32(1, i, ptrue, &C[1 * svl + i * c_step]); | |||
| svld1_ver_za32(2, i, ptrue, &C[0 * svl + (i + svl) * c_step]); | |||
| svld1_ver_za32(3, i, ptrue, &C[1 * svl + (i + svl) * c_step]); | |||
| } | |||
| svfloat32_t alpha_vec = svdup_f32(alpha); | |||
| // Iterate through shared dimension (K) | |||
| for (size_t k = 0; k < shared_dim; k++) { | |||
| // Load column of A | |||
| svfloat32x2_t cols_a = svld1_x2(svptrue_c32(), &A[k * a_step]); | |||
| // Load row of B | |||
| svfloat32x2_t rows_b = svld1_x2(svptrue_c32(), &B[k * b_step]); | |||
| // Multiply B through by alpha | |||
| svfloat32_t row_b_0 = svmul_x(ptrue, alpha_vec, svget2(rows_b, 0)); | |||
| svfloat32_t row_b_1 = svmul_x(ptrue, alpha_vec, svget2(rows_b, 1)); | |||
| // Perform outer products | |||
| svmopa_za32_m(0, ptrue, ptrue, svget2(cols_a, 0), row_b_0); | |||
| svmopa_za32_m(1, ptrue, ptrue, svget2(cols_a, 1), row_b_0); | |||
| svmopa_za32_m(2, ptrue, ptrue, svget2(cols_a, 0), row_b_1); | |||
| svmopa_za32_m(3, ptrue, ptrue, svget2(cols_a, 1), row_b_1); | |||
| } | |||
| // Store out to C from ZA | |||
| for (size_t i = 0; i < (svl >> 1); i++) { | |||
| // Store out one row of C per tile | |||
| svst1_ver_za32(0, i, ptrue, &C[0 * svl + i * c_step]); | |||
| svst1_ver_za32(1, i, ptrue, &C[1 * svl + i * c_step]); | |||
| svst1_ver_za32(2, i, ptrue, &C[0 * svl + (i + svl) * c_step]); | |||
| svst1_ver_za32(3, i, ptrue, &C[1 * svl + (i + svl) * c_step]); | |||
| } | |||
| } | |||
| // Outer product kernel. | |||
| // Computes an SVL x SVL block of C, utilizing a single FP32 tile of ZA (ZA0). | |||
| // This kernel is predicated, and can handle under-filled blocks. | |||
| __attribute__((always_inline)) inline void | |||
| kernel_1x1(const float *A, const float *B, float *C, float alpha, | |||
| size_t shared_dim, size_t a_len, size_t a_step, size_t b_len, | |||
| size_t b_step, size_t c_step, size_t c_rows, size_t c_cols) | |||
| __arm_out("za") __arm_streaming { | |||
| // Predicate set-up | |||
| svbool_t pg = svptrue_b32(); | |||
| svbool_t pg_a = svwhilelt_b32_u64(0, a_len); | |||
| svbool_t pg_b = svwhilelt_b32_u64(0, b_len); | |||
| svbool_t pg_c = svwhilelt_b32_u64(0, c_rows); | |||
| // Load from C into ZA | |||
| for (size_t i = 0; i < c_cols; i++) { | |||
| svld1_ver_za32(0, i, pg_c, &C[i * c_step]); | |||
| } | |||
| svfloat32_t alpha_vec = svdup_f32_z(pg_b, alpha); | |||
| // Iterate through shared dimension (K) | |||
| for (size_t k = 0; k < shared_dim; k++) { | |||
| // Load column of A | |||
| svfloat32_t col_a = svld1(pg_a, &A[k * a_step]); | |||
| // Load row of B | |||
| svfloat32_t row_b = svld1(pg_b, &B[k * b_step]); | |||
| // Multiply B through by alpha | |||
| row_b = svmul_x(pg_b, alpha_vec, row_b); | |||
| // Perform outer product | |||
| svmopa_za32_m(0, pg, pg, col_a, row_b); | |||
| } | |||
| // Store out to C from ZA | |||
| for (size_t i = 0; i < c_cols; i++) { | |||
| svst1_ver_za32(0, i, pg_c, &C[i * c_step]); | |||
| } | |||
| } | |||
| __arm_new("za") __arm_locally_streaming | |||
| int CNAME(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alpha0, FLOAT *ba, | |||
| FLOAT *bb, FLOAT *C, BLASLONG ldc) { | |||
| const BLASLONG num_rows = bm; | |||
| const BLASLONG num_cols = bn; | |||
| const FLOAT *a_ptr = ba; | |||
| const FLOAT *b_ptr = bb; | |||
| FLOAT *c_ptr = C; | |||
| const BLASLONG svl = svcntw(); | |||
| const BLASLONG a_step = bm; | |||
| const BLASLONG b_step = bn; | |||
| const BLASLONG c_step = ldc; | |||
| // Block over rows of C (panels of A) | |||
| BLASLONG row_idx = 0; | |||
| // 2x2 loop | |||
| BLASLONG row_batch = 2 * svl; | |||
| // Block over row dimension of C | |||
| for (; row_idx + row_batch <= num_rows; row_idx += row_batch) { | |||
| BLASLONG col_idx = 0; | |||
| BLASLONG col_batch = 2 * svl; | |||
| // Block over column dimension of C | |||
| for (; col_idx + col_batch <= num_cols; col_idx += col_batch) { | |||
| kernel_2x2(&a_ptr[row_idx], &b_ptr[col_idx], | |||
| &c_ptr[row_idx + col_idx * c_step], alpha0, bk, a_step, b_step, | |||
| c_step); | |||
| } | |||
| // Handle under-filled blocks w/ 2x(1x1) kernels | |||
| col_batch = 1 * svl; | |||
| for (; col_idx < num_cols; col_idx += col_batch) { | |||
| col_batch = MIN(col_batch, num_cols - col_idx); | |||
| kernel_1x1(&a_ptr[row_idx], &b_ptr[col_idx], | |||
| &c_ptr[row_idx + col_idx * c_step], alpha0, bk, svl, a_step, | |||
| col_batch, b_step, c_step, svl, col_batch); | |||
| kernel_1x1(&a_ptr[row_idx + svl], &b_ptr[col_idx], | |||
| &c_ptr[(row_idx + svl) + col_idx * c_step], alpha0, bk, svl, | |||
| a_step, col_batch, b_step, c_step, svl, col_batch); | |||
| } | |||
| } | |||
| // Handle under-filled blocks w/ 1x1 kernels | |||
| row_batch = 1 * svl; | |||
| for (; row_idx < num_rows; row_idx += row_batch) { | |||
| row_batch = MIN(row_batch, num_rows - row_idx); | |||
| // Block over column dimension of C | |||
| BLASLONG col_batch = svl; | |||
| for (BLASLONG col_idx = 0; col_idx < num_cols; col_idx += col_batch) { | |||
| col_batch = MIN(col_batch, num_cols - col_idx); | |||
| kernel_1x1(&a_ptr[row_idx], &b_ptr[col_idx], | |||
| &c_ptr[row_idx + col_idx * c_step], alpha0, bk, row_batch, | |||
| a_step, col_batch, b_step, c_step, row_batch, col_batch); | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,64 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2024, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |||
| CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE | |||
| GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | |||
| HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | |||
| LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF | |||
| THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include <arm_sme.h> | |||
| #include "common.h" | |||
| #include "sme_abi.h" | |||
| // Transpose 1SVL x N panel of A into B | |||
| __attribute__((always_inline)) inline static void | |||
| transpose_panel(const FLOAT *a, FLOAT *b, BLASLONG rows, BLASLONG cols, | |||
| BLASLONG a_step, BLASLONG b_step) | |||
| __arm_out("za") __arm_streaming { | |||
| BLASLONG col_batch = svcntsw(); | |||
| const svbool_t pg_a = svwhilelt_b32_u64(0, rows); | |||
| for (BLASLONG k = 0; k < cols; k += col_batch) { | |||
| col_batch = MIN(col_batch, cols - k); | |||
| for (BLASLONG col = 0; col < col_batch; col++) { | |||
| svld1_ver_za32(0, col, pg_a, &a[(col + k) * a_step]); | |||
| } | |||
| const svbool_t pg_b = svwhilelt_b32_u64(k, cols); | |||
| for (BLASLONG row = 0; row < rows; row++) { | |||
| svst1_hor_za32(0, row, pg_b, &b[row * b_step + k]); | |||
| } | |||
| } | |||
| } | |||
| __arm_new("za") __arm_locally_streaming | |||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b) { | |||
| const BLASLONG num_rows = m; | |||
| BLASLONG row_batch = svcntsw(); | |||
| for (BLASLONG row_idx = 0; row_idx < num_rows; row_idx += row_batch) { | |||
| // Transpose 1xSVL panel | |||
| row_batch = MIN(row_batch, num_rows - row_idx); | |||
| transpose_panel(&a[row_idx], &b[row_idx * n], row_batch, n, lda, n); | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,42 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2024, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |||
| CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE | |||
| GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | |||
| HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | |||
| LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF | |||
| THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include <arm_sve.h> | |||
| #include "common.h" | |||
| #include "sme_abi.h" | |||
| __arm_locally_streaming int CNAME(BLASLONG m, BLASLONG n, FLOAT *restrict a, | |||
| BLASLONG lda, FLOAT *restrict b) { | |||
| for (BLASLONG i = 0; i < m; i++) { | |||
| for (BLASLONG j = 0; j < n; j += svcntw()) { | |||
| svbool_t pg = svwhilelt_b32_u64(j, n); | |||
| svst1(pg, &b[i * n + j], svld1(pg, &a[i * lda + j])); | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,45 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2024, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |||
| CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE | |||
| GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | |||
| HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | |||
| LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF | |||
| THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #pragma once | |||
| #include <stdlib.h> | |||
| /** | |||
| * These are SME ABI routines for saving & restoring SME state. | |||
| * They are typically provided by a compiler runtime library such | |||
| * as libgcc or compiler-rt, but support for these routines is not | |||
| * yet available on all platforms. | |||
| * | |||
| * Define these as aborting stubs so that we loudly fail on nested | |||
| * usage of SME state. | |||
| * | |||
| * These are defined as weak symbols so that a compiler runtime can | |||
| * override them if supported. | |||
| */ | |||
| __attribute__((weak)) void __arm_tpidr2_save() { abort(); } | |||
| __attribute__((weak)) void __arm_tpidr2_restore() { abort(); } | |||
| @@ -3667,7 +3667,7 @@ Until then, just keep it different than DGEMM_DEFAULT_UNROLL_N to keep copy rout | |||
| #define CGEMM_DEFAULT_R 4096 | |||
| #define ZGEMM_DEFAULT_R 4096 | |||
| #elif defined(ARMV8SVE) || defined(ARMV9) || defined(CORTEXA510)|| defined(CORTEXA710) || defined(CORTEXX2) // 128-bit SVE | |||
| #elif defined(ARMV8SVE) || defined(ARMV9SME) || defined(ARMV9) || defined(CORTEXA510)|| defined(CORTEXA710) || defined(CORTEXX2) // 128-bit SVE | |||
| #if defined(XDOUBLE) || defined(DOUBLE) | |||
| #define SWITCH_RATIO 8 | |||