| @@ -28,6 +28,9 @@ jobs: | |||
| - target: RISCV64_ZVL256B | |||
| opts: TARGET=RISCV64_ZVL256B BINARY=64 ARCH=riscv64 | |||
| qemu_cpu: rv64,g=true,c=true,v=true,vext_spec=v1.0,vlen=256,elen=64 | |||
| - target: DYNAMIC_ARCH=1 | |||
| opts: TARGET=RISCV64_GENERIC BINARY=64 ARCH=riscv64 DYNAMIC_ARCH=1 | |||
| qemu_cpu: rv64,g=true,c=true,v=true,vext_spec=v1.0,vlen=256,elen=64 | |||
| steps: | |||
| - name: Checkout repository | |||
| @@ -1,7 +1,7 @@ | |||
| pipeline { | |||
| agent { | |||
| docker { | |||
| image 'osuosl/ubuntu-ppc64le' | |||
| image 'osuosl/ubuntu-ppc64le:18.04' | |||
| } | |||
| } | |||
| stages { | |||
| @@ -276,12 +276,19 @@ endif | |||
| endif | |||
| endif | |||
| ifeq (1, $(filter 1,$(GCCVERSIONGTEQ11) $(ISCLANG))) | |||
| ifeq ($(CORE), A64FX) | |||
| ifeq (1, $(filter 1,$(GCCVERSIONGTEQ10) $(ISCLANG))) | |||
| ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ3) $(GCCVERSIONGTEQ11) $(ISCLANG))) | |||
| CCOMMON_OPT += -march=armv8.2-a+sve -mtune=a64fx | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=armv8.2-a+sve -mtune=a64fx | |||
| endif | |||
| else | |||
| CCOMMON_OPT += -march=armv8.4-a+sve -mtune=neoverse-n1 | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=armv8.4-a -mtune=neoverse-n1 | |||
| endif | |||
| endif | |||
| endif | |||
| endif | |||
| @@ -268,13 +268,24 @@ SMALL_MATRIX_OPT = 1 | |||
| else ifeq ($(ARCH), power) | |||
| SMALL_MATRIX_OPT = 1 | |||
| BUILD_BFLOAT16 = 1 | |||
| else ifeq ($(ARCH), arm64) | |||
| SMALL_MATRIX_OPT = 1 | |||
| endif | |||
| ifeq ($(ARCH), loongarch64) | |||
| SMALL_MATRIX_OPT = 1 | |||
| endif | |||
| ifeq ($(ARCH), arm64) | |||
| GEMM_GEMV_FORWARD = 1 | |||
| endif | |||
| ifeq ($(SMALL_MATRIX_OPT), 1) | |||
| CCOMMON_OPT += -DSMALL_MATRIX_OPT | |||
| endif | |||
| ifeq ($(GEMM_GEMV_FORWARD), 1) | |||
| ifneq ($(ONLY_CBLAS), 1) | |||
| CCOMMON_OPT += -DGEMM_GEMV_FORWARD | |||
| endif | |||
| endif | |||
| # This operation is expensive, so execution should be once. | |||
| ifndef GOTOBLAS_MAKEFILE | |||
| @@ -689,6 +700,7 @@ ifneq ($(NO_SVE), 1) | |||
| DYNAMIC_CORE += NEOVERSEV1 | |||
| DYNAMIC_CORE += NEOVERSEN2 | |||
| DYNAMIC_CORE += ARMV8SVE | |||
| DYNAMIC_CORE += A64FX | |||
| endif | |||
| DYNAMIC_CORE += THUNDERX | |||
| DYNAMIC_CORE += THUNDERX2T99 | |||
| @@ -715,6 +727,17 @@ ifeq ($(ARCH), loongarch64) | |||
| DYNAMIC_CORE = LOONGSON3R5 LOONGSON2K1000 LOONGSONGENERIC | |||
| endif | |||
| ifeq ($(ARCH), riscv64) | |||
| DYNAMIC_CORE = RISCV64_GENERIC | |||
| DYNAMIC_CORE += RISCV64_ZVL128B | |||
| DYNAMIC_CORE += RISCV64_ZVL256B | |||
| ifdef DYNAMIC_LIST | |||
| override DYNAMIC_CORE = RISCV64_GENERIC $(DYNAMIC_LIST) | |||
| XCCOMMON_OPT = -DDYNAMIC_LIST -DDYN_RISCV64_GENERIC | |||
| XCCOMMON_OPT += $(foreach dcore,$(DYNAMIC_LIST),-DDYN_$(dcore)) | |||
| endif | |||
| endif | |||
| ifeq ($(ARCH), zarch) | |||
| DYNAMIC_CORE = ZARCH_GENERIC | |||
| @@ -234,6 +234,8 @@ For **POWER**, the list encompasses POWER6, POWER8 and POWER9. POWER10 is additi | |||
| on **ZARCH** it comprises Z13 and Z14 as well as generic zarch support. | |||
| On **riscv64**, DYNAMIC_ARCH enables support for riscv64_zvl128b and riscv64_zvl256b in addition to generic riscv64 support. A compiler that supports RVV 1.0 is required to build OpenBLAS for riscv64 when DYNAMIC_ARCH is enabled. | |||
| The `TARGET` option can be used in conjunction with `DYNAMIC_ARCH=1` to specify which cpu model should be assumed for all the | |||
| common code in the library, usually you will want to set this to the oldest model you expect to encounter. | |||
| Please note that it is not possible to combine support for different architectures, so no combined 32 and 64 bit or x86_64 and arm64 in the same library. | |||
| @@ -234,14 +234,10 @@ def test_gesdd(benchmark, mn, variant): | |||
| gesdd = ow.get_func('gesdd', variant) | |||
| u, s, vt, info = benchmark(run_gesdd, a, lwork, gesdd) | |||
| if variant != 's': | |||
| # On entry to SLASCL parameter number 4 had an illegal value | |||
| # under codspeed (cannot repro locally or on CI w/o codspeed) | |||
| # https://github.com/OpenMathLib/OpenBLAS/issues/4776 | |||
| assert info == 0 | |||
| atol = {'s': 1e-5, 'd': 1e-13} | |||
| np.testing.assert_allclose(u @ np.diag(s) @ vt, a, atol=atol[variant]) | |||
| assert info == 0 | |||
| atol = {'s': 1e-5, 'd': 1e-13} | |||
| np.testing.assert_allclose(u @ np.diag(s) @ vt, a, atol=atol[variant]) | |||
| # linalg.eigh | |||
| @@ -356,6 +356,9 @@ if [ "$compiler" = "GCC" ]; then | |||
| no_avx2=0 | |||
| oldgcc=0 | |||
| data=`$compiler_name -dumpversion` | |||
| case "$data" in *-*) | |||
| data="${data%-*}" | |||
| esac | |||
| case "$data" in *.*.*) | |||
| data="${data%.*}" | |||
| esac | |||
| @@ -46,7 +46,7 @@ if (DYNAMIC_ARCH) | |||
| if (ARM64) | |||
| set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110) | |||
| if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 9.99) | |||
| set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2 ARMV8SVE) | |||
| set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2 ARMV8SVE A64FX) | |||
| endif () | |||
| if (DYNAMIC_LIST) | |||
| set(DYNAMIC_CORE ARMV8 ${DYNAMIC_LIST}) | |||
| @@ -1218,6 +1218,37 @@ endif () | |||
| set(ZGEMM_UNROLL_M 4) | |||
| set(ZGEMM_UNROLL_N 4) | |||
| set(SYMV_P 16) | |||
| elseif ("${TCORE}" STREQUAL "A64FX") | |||
| file(APPEND ${TARGET_CONF_TEMP} | |||
| "#define L1_CODE_SIZE\t65536\n" | |||
| "#define L1_CODE_LINESIZE\t256\n" | |||
| "#define L1_CODE_ASSOCIATIVE\t8\n" | |||
| "#define L1_DATA_SIZE\t32768\n" | |||
| "#define L1_DATA_LINESIZE\t256\n" | |||
| "#define L1_DATA_ASSOCIATIVE\t8\n" | |||
| "#define L2_SIZE\t8388608\n\n" | |||
| "#define L2_LINESIZE\t256\n" | |||
| "#define L2_ASSOCIATIVE\t8\n" | |||
| "#define L3_SIZE\t0\n\n" | |||
| "#define L3_LINESIZE\t0\n\n" | |||
| "#define L3_ASSOCIATIVE\t0\n\n" | |||
| "#define DTB_DEFAULT_ENTRIES\t64\n" | |||
| "#define DTB_SIZE\t4096\n" | |||
| "#define HAVE_VFPV4\n" | |||
| "#define HAVE_VFPV3\n" | |||
| "#define HAVE_VFP\n" | |||
| "#define HAVE_NEON\n" | |||
| "#define HAVE_SVE\n" | |||
| "#define ARMV8\n") | |||
| set(SGEMM_UNROLL_M 4) | |||
| set(SGEMM_UNROLL_N 8) | |||
| set(DGEMM_UNROLL_M 2) | |||
| set(DGEMM_UNROLL_N 8) | |||
| set(CGEMM_UNROLL_M 2) | |||
| set(CGEMM_UNROLL_N 4) | |||
| set(ZGEMM_UNROLL_M 2) | |||
| set(ZGEMM_UNROLL_N 4) | |||
| set(SYMV_P 16) | |||
| elseif ("${TCORE}" STREQUAL "P5600") | |||
| file(APPEND ${TARGET_CONF_TEMP} | |||
| "#define L2_SIZE 1048576\n" | |||
| @@ -310,6 +310,18 @@ if (${TARGET} STREQUAL NEOVERSEV1) | |||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv8.2-a+sve") | |||
| endif() | |||
| endif() | |||
| if (${TARGET} STREQUAL A64FX) | |||
| if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE) | |||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -Msve-intrinsics -march=armv8.2-a+sve -mtune=a64fx") | |||
| else () | |||
| execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) | |||
| if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4) | |||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv8.2-a+sve -mtune=a64fx") | |||
| else () | |||
| message(FATAL_ERROR "Compiler $${CMAKE_C_COMPILER} {GCC_VERSION} does not support A64FX.") | |||
| endif() | |||
| endif() | |||
| endif() | |||
| endif() | |||
| @@ -379,6 +391,13 @@ endif () | |||
| if (X86_64 OR ${CORE} STREQUAL POWER10) | |||
| set(SMALL_MATRIX_OPT TRUE) | |||
| endif () | |||
| if (ARM64) | |||
| set(GEMM_GEMV_FORWARD TRUE) | |||
| endif () | |||
| if (GEMM_GEMV_FORWARD AND NOT ONLY_CBLAS) | |||
| set(CCOMMON_OPT "${CCOMMON_OPT} -DGEMM_GEMV_FORWARD") | |||
| endif () | |||
| if (SMALL_MATRIX_OPT) | |||
| set(CCOMMON_OPT "${CCOMMON_OPT} -DSMALL_MATRIX_OPT") | |||
| endif () | |||
| @@ -26,7 +26,7 @@ endif | |||
| override CFLAGS += -DADD$(BU) -DCBLAS | |||
| ifeq ($(F_COMPILER),GFORTRAN) | |||
| ifneq (, $(filter $(CORE),LOONGSON3R3 LOONGSON3R4)) | |||
| override FFLAGS = $(filter_out(-O2 -O3,$(FFLAGS)) -O0 | |||
| override FFLAGS = $(filter_out(-O2 -O3,$(FFLAGS))) -O0 | |||
| endif | |||
| override FFLAGS += -fno-tree-vectorize | |||
| endif | |||
| @@ -245,7 +245,7 @@ newer installed. | |||
| On Windows 11 with Visual Studio 2022, this would be done by invoking: | |||
| ```shell | |||
| "c:\Program Files\Microsoft Visual Studio\2022\Preview\vc\Auxiliary\Build\vcvars64.bat" | |||
| "c:\Program Files\Microsoft Visual Studio\2022\Community\vc\Auxiliary\Build\vcvars64.bat" | |||
| ``` | |||
| With VS2019, the command should be the same (except for the year number of course). | |||
| @@ -30,12 +30,16 @@ else | |||
| ifeq ($(ARCH),loongarch64) | |||
| COMMONOBJS += dynamic_loongarch64.$(SUFFIX) | |||
| else | |||
| ifeq ($(ARCH),riscv64) | |||
| COMMONOBJS += dynamic_riscv64.$(SUFFIX) detect_riscv64.$(SUFFIX) | |||
| else | |||
| COMMONOBJS += dynamic.$(SUFFIX) | |||
| endif | |||
| endif | |||
| endif | |||
| endif | |||
| endif | |||
| endif | |||
| else | |||
| COMMONOBJS += parameter.$(SUFFIX) | |||
| endif | |||
| @@ -106,12 +110,16 @@ else | |||
| ifeq ($(ARCH),loongarch64) | |||
| HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_loongarch64.$(SUFFIX) | |||
| else | |||
| ifeq ($(ARCH),riscv64) | |||
| HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_riscv64.$(SUFFIX) detect_riscv64.$(SUFFIX) | |||
| else | |||
| HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic.$(SUFFIX) | |||
| endif | |||
| endif | |||
| endif | |||
| endif | |||
| endif | |||
| endif | |||
| else | |||
| HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) parameter.$(SUFFIX) | |||
| endif | |||
| @@ -209,6 +217,9 @@ addx.$(SUFFIX) : $(ARCH)/addx.c | |||
| mulx.$(SUFFIX) : $(ARCH)/mulx.c | |||
| $(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $(@F) | |||
| detect_riscv64.$(SUFFIX): detect_riscv64.c | |||
| $(CC) $(CFLAGS) -c -march=rv64imafdcv $< -o $(@F) | |||
| xerbla.$(PSUFFIX) : xerbla.c | |||
| $(CC) $(PFLAGS) -c $< -o $(@F) | |||
| @@ -0,0 +1,75 @@ | |||
| /***************************************************************************** | |||
| Copyright (c) 2024, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written | |||
| permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| **********************************************************************************/ | |||
| #include <stdint.h> | |||
| #ifdef __riscv_v_intrinsic | |||
| #include <riscv_vector.h> | |||
| #endif | |||
| unsigned detect_riscv64_get_vlenb(void) { | |||
| #ifdef __riscv_v_intrinsic | |||
| return __riscv_vlenb(); | |||
| #else | |||
| return 0; | |||
| #endif | |||
| } | |||
| /* | |||
| * Based on the approach taken here: | |||
| * https://code.videolan.org/videolan/dav1d/-/merge_requests/1629 | |||
| * | |||
| * Only to be called after we've determined we have some sort of | |||
| * RVV support. | |||
| */ | |||
| uint64_t detect_riscv64_rvv100(void) | |||
| { | |||
| uint64_t rvv10_supported; | |||
| /* | |||
| * After the vsetvli statement vtype will either be a value > 0 if the | |||
| * vsetvli succeeded or less than 0 if it failed. If 0 < vtype | |||
| * we're good and the function will return 1, otherwise there's no | |||
| * RVV 1.0 and we return 0. | |||
| */ | |||
| asm volatile("vsetvli x0, x0, e8, m1, ta, ma\n\t" | |||
| "csrr %0, vtype\n\t" | |||
| "slt %0, x0, %0\n" | |||
| : "=r" (rvv10_supported) | |||
| : | |||
| :); | |||
| return rvv10_supported; | |||
| } | |||
| @@ -120,6 +120,11 @@ extern gotoblas_t gotoblas_CORTEXA55; | |||
| #else | |||
| #define gotoblas_CORTEXA55 gotoblas_ARMV8 | |||
| #endif | |||
| #ifdef DYN_A64FX | |||
| extern gotoblas_t gotoblas_A64FX; | |||
| #else | |||
| #define gotoblas_A64FX gotoblas_ARMV8 | |||
| #endif | |||
| #else | |||
| extern gotoblas_t gotoblas_CORTEXA53; | |||
| #define gotoblas_CORTEXA55 gotoblas_CORTEXA53 | |||
| @@ -136,10 +141,12 @@ extern gotoblas_t gotoblas_NEOVERSEN1; | |||
| extern gotoblas_t gotoblas_NEOVERSEV1; | |||
| extern gotoblas_t gotoblas_NEOVERSEN2; | |||
| extern gotoblas_t gotoblas_ARMV8SVE; | |||
| extern gotoblas_t gotoblas_A64FX; | |||
| #else | |||
| #define gotoblas_NEOVERSEV1 gotoblas_ARMV8 | |||
| #define gotoblas_NEOVERSEN2 gotoblas_ARMV8 | |||
| #define gotoblas_ARMV8SVE gotoblas_ARMV8 | |||
| #define gotoblas_A64FX gotoblas_ARMV8 | |||
| #endif | |||
| extern gotoblas_t gotoblas_THUNDERX3T110; | |||
| #endif | |||
| @@ -149,7 +156,7 @@ extern void openblas_warning(int verbose, const char * msg); | |||
| #define FALLBACK_VERBOSE 1 | |||
| #define NEOVERSEN1_FALLBACK "OpenBLAS : Your OS does not support SVE instructions. OpenBLAS is using Neoverse N1 kernels as a fallback, which may give poorer performance.\n" | |||
| #define NUM_CORETYPES 17 | |||
| #define NUM_CORETYPES 18 | |||
| /* | |||
| * In case asm/hwcap.h is outdated on the build system, make sure | |||
| @@ -184,6 +191,7 @@ static char *corename[] = { | |||
| "thunderx3t110", | |||
| "cortexa55", | |||
| "armv8sve", | |||
| "a64fx", | |||
| "unknown" | |||
| }; | |||
| @@ -205,6 +213,7 @@ char *gotoblas_corename(void) { | |||
| if (gotoblas == &gotoblas_THUNDERX3T110) return corename[14]; | |||
| if (gotoblas == &gotoblas_CORTEXA55) return corename[15]; | |||
| if (gotoblas == &gotoblas_ARMV8SVE) return corename[16]; | |||
| if (gotoblas == &gotoblas_A64FX) return corename[17]; | |||
| return corename[NUM_CORETYPES]; | |||
| } | |||
| @@ -241,6 +250,7 @@ static gotoblas_t *force_coretype(char *coretype) { | |||
| case 14: return (&gotoblas_THUNDERX3T110); | |||
| case 15: return (&gotoblas_CORTEXA55); | |||
| case 16: return (&gotoblas_ARMV8SVE); | |||
| case 17: return (&gotoblas_A64FX); | |||
| } | |||
| snprintf(message, 128, "Core not found: %s\n", coretype); | |||
| openblas_warning(1, message); | |||
| @@ -346,6 +356,15 @@ static gotoblas_t *get_coretype(void) { | |||
| return &gotoblas_THUNDERX3T110; | |||
| } | |||
| break; | |||
| case 0x46: // Fujitsu | |||
| switch (part) | |||
| { | |||
| #ifndef NO_SVE | |||
| case 0x001: // A64FX | |||
| return &gotoblas_A64FX; | |||
| #endif | |||
| } | |||
| break; | |||
| case 0x48: // HiSilicon | |||
| switch (part) | |||
| { | |||
| @@ -0,0 +1,269 @@ | |||
| /***************************************************************************** | |||
| Copyright (c) 2024, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written | |||
| permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| **********************************************************************************/ | |||
| #include <stdbool.h> | |||
| #include "common.h" | |||
| /* | |||
| * OpenBLAS contains some kernels that are optimised for RVV 1.0. Before we | |||
| * can use these kernels we need to determine whether the device supports | |||
| * RVV 1.0 and what the device's VLEN is. Our strategy will be as follows. | |||
| * | |||
| * First we'll invoke the hwprobe syscall to detect RVV 1.0. In an ideal world, | |||
| * this is all we should need to do. If the syscall is not implemented we | |||
| * should be able to deduce that RVV 1.0 is not supported (as it was added to | |||
| * Linux after hwprobe) and if the syscall is implemented we can use it to | |||
| * determine whether RVV 1.0 is supported. However, there are some riscv64 | |||
| * boards out there that implement RVV 1.0 but ship with a Linux kernel that | |||
| * predates RVV vector support and hwprobe support. These kernels contain | |||
| * the backported RVV patches but not the hwprobe patches and so they | |||
| * advertise support for RVV via hwcap. To cater for these boards we need | |||
| * to fall back to hwcap if hwprobe is not supported. Unfortunately, some | |||
| * boards indicate support for RVV via hwcap even though they only support | |||
| * RVV 0.7.1, which is incompatible with RVV 1.0. So an additional check is | |||
| * required to test if the devices advertising support for RVV via hwcap really | |||
| * support RVV 1.0. This test works by executing a vsetvli instruction that | |||
| * sets the tail agnostic and mask agnostic bits in the vtype register. | |||
| * These bits are not supported prior to RVV 0.9 so will cause the VIL bit to | |||
| * be set on the VTYPE register in CPUs supporting 0.7.1. If this bit is set | |||
| * we can determine that RVV 1.0 is not supported. | |||
| * | |||
| * This approach is borrowed from | |||
| * VideoLan dav1d: | |||
| * (https://code.videolan.org/videolan/dav1d/-/merge_requests/1629). | |||
| * | |||
| * We assume that if a kernel reports the presence of RVV via hwcap that | |||
| * the device supports the vsetvli instruction. | |||
| * | |||
| * For now we're just going to invoke the hwprobe syscall directly, rather than | |||
| * invoking it through glibc. Support for hwprobe has been added to glibc but | |||
| * at the time of writing this support has not yet been included in a glibc | |||
| * release. Once it has, it will be better to invoke hwprobe via glibc as doing | |||
| * so should take advantage of the vdso entry and be more efficient. | |||
| */ | |||
| /* | |||
| * This should work on Android as well but I have no way of testing. | |||
| */ | |||
| #if defined(OS_LINUX) | |||
| #include <unistd.h> | |||
| #include <sys/syscall.h> | |||
| #include <stdint.h> | |||
| #include <sys/auxv.h> | |||
| #define DETECT_RISCV64_HWCAP_ISA_V (1 << ('V' - 'A')) | |||
| struct riscv_hwprobe { | |||
| int64_t key; | |||
| uint64_t value; | |||
| }; | |||
| /* The constants below are copied from | |||
| * /usr/include/riscv64-linux-gnu/asm/hwprobe.h. We duplicate the | |||
| * constants as the header file from which they are copied will only | |||
| * be present if we're building on a device with Linux 6.5 or greater. | |||
| */ | |||
| #define RISCV_HWPROBE_KEY_IMA_EXT_0 4 | |||
| #define RISCV_HWPROBE_IMA_V (1 << 2) | |||
| #ifndef NR_riscv_hwprobe | |||
| #ifndef NR_arch_specific_syscall | |||
| #define NR_arch_specific_syscall 244 | |||
| #endif | |||
| #define NR_riscv_hwprobe (NR_arch_specific_syscall + 14) | |||
| #endif | |||
| #endif // defined(OS_LINUX) | |||
| unsigned detect_riscv64_get_vlenb(void); | |||
| uint64_t detect_riscv64_rvv100(void); | |||
| extern gotoblas_t gotoblas_RISCV64_GENERIC; | |||
| #if !defined(DYNAMIC_LIST) || defined(DYN_RISCV64_ZVL256B) | |||
| extern gotoblas_t gotoblas_RISCV64_ZVL256B; | |||
| #endif | |||
| #if !defined(DYNAMIC_LIST) || defined(DYN_RISCV64_ZVL128B) | |||
| extern gotoblas_t gotoblas_RISCV64_ZVL128B; | |||
| #endif | |||
| #define CPU_GENERIC 0 | |||
| #define CPU_RISCV64_ZVL256B 1 | |||
| #define CPU_RISCV64_ZVL128B 2 | |||
| static char *cpuname[] = { | |||
| "riscv64_generic", | |||
| "riscv64_zvl256b", | |||
| "riscv64_zvl128b" | |||
| }; | |||
| #define NUM_CORETYPES (sizeof(cpuname)/sizeof(char*)) | |||
| extern int openblas_verbose(void); | |||
| extern void openblas_warning(int verbose, const char* msg); | |||
| char* gotoblas_corename(void) { | |||
| #if !defined(DYNAMIC_LIST) || defined(DYN_RISCV64_ZVL256B) | |||
| if (gotoblas == &gotoblas_RISCV64_ZVL256B) | |||
| return cpuname[CPU_RISCV64_ZVL256B]; | |||
| #endif | |||
| #if !defined(DYNAMIC_LIST) || defined(DYN_RISCV64_ZVL128B) | |||
| if (gotoblas == &gotoblas_RISCV64_ZVL128B) | |||
| return cpuname[CPU_RISCV64_ZVL128B]; | |||
| #endif | |||
| if (gotoblas == &gotoblas_RISCV64_GENERIC) | |||
| return cpuname[CPU_GENERIC]; | |||
| return "unknown"; | |||
| } | |||
| static gotoblas_t* get_coretype(void) { | |||
| unsigned vlenb = 0; | |||
| #if !defined(OS_LINUX) | |||
| return NULL; | |||
| #else | |||
| /* | |||
| * See the hwprobe documentation | |||
| * | |||
| * ( https://docs.kernel.org/arch/riscv/hwprobe.html ) | |||
| * for more details. | |||
| */ | |||
| struct riscv_hwprobe pairs[] = { | |||
| { .key = RISCV_HWPROBE_KEY_IMA_EXT_0, }, | |||
| }; | |||
| int ret = syscall(NR_riscv_hwprobe, pairs, 1, 0, NULL, 0); | |||
| if (ret == 0) { | |||
| if (!(pairs[0].value & RISCV_HWPROBE_IMA_V)) | |||
| return NULL; | |||
| } else { | |||
| if (!(getauxval(AT_HWCAP) & DETECT_RISCV64_HWCAP_ISA_V)) | |||
| return NULL; | |||
| if (!detect_riscv64_rvv100()) | |||
| return NULL; | |||
| } | |||
| /* | |||
| * RVV 1.0 is supported. We now just need to determine the coretype | |||
| * based on the VLEN. | |||
| */ | |||
| vlenb = detect_riscv64_get_vlenb(); | |||
| if (vlenb < 16) | |||
| return NULL; | |||
| #if !defined(DYNAMIC_LIST) || defined(DYN_RISCV64_ZVL256B) | |||
| if (vlenb >= 32) | |||
| return &gotoblas_RISCV64_ZVL256B; | |||
| #endif | |||
| #if !defined(DYNAMIC_LIST) || defined(DYN_RISCV64_ZVL128B) | |||
| return &gotoblas_RISCV64_ZVL128B; | |||
| #else | |||
| return NULL; | |||
| #endif | |||
| #endif // !defined(OS_LINUX) | |||
| } | |||
| static gotoblas_t* force_coretype(char* coretype) { | |||
| size_t i; | |||
| char message[128]; | |||
| for (i = 0; i < NUM_CORETYPES && strcasecmp(coretype, cpuname[i]); i++); | |||
| if (i == CPU_GENERIC) | |||
| return &gotoblas_RISCV64_GENERIC; | |||
| if (i == CPU_RISCV64_ZVL256B) { | |||
| #if !defined(DYNAMIC_LIST) || defined(DYN_RISCV64_ZVL256B) | |||
| return &gotoblas_RISCV64_ZVL256B; | |||
| #else | |||
| openblas_warning(1, | |||
| "riscv64_zvl256b support not compiled in\n"); | |||
| return NULL; | |||
| #endif | |||
| } | |||
| if (i == CPU_RISCV64_ZVL128B) { | |||
| #if !defined(DYNAMIC_LIST) || defined(DYN_RISCV64_ZVL128B) | |||
| return &gotoblas_RISCV64_ZVL128B; | |||
| #else | |||
| openblas_warning(1, | |||
| "riscv64_zvl128b support not compiled in\n"); | |||
| return NULL; | |||
| #endif | |||
| } | |||
| snprintf(message, sizeof(message), "Core not found: %s\n", coretype); | |||
| openblas_warning(1, message); | |||
| return NULL; | |||
| } | |||
| void gotoblas_dynamic_init(void) { | |||
| char coremsg[128]; | |||
| char* p; | |||
| if (gotoblas) return; | |||
| p = getenv("OPENBLAS_CORETYPE"); | |||
| if (p) | |||
| gotoblas = force_coretype(p); | |||
| else | |||
| gotoblas = get_coretype(); | |||
| if (!gotoblas) { | |||
| snprintf(coremsg, sizeof(coremsg), "Falling back to generic riscv64 core\n"); | |||
| openblas_warning(1, coremsg); | |||
| gotoblas = &gotoblas_RISCV64_GENERIC; | |||
| } | |||
| if (gotoblas->init) { | |||
| snprintf(coremsg, sizeof(coremsg), "Core: %s\n", | |||
| gotoblas_corename()); | |||
| openblas_warning(2, coremsg); | |||
| gotoblas->init(); | |||
| return; | |||
| } | |||
| openblas_warning(0, "OpenBLAS : Architecture Initialization failed. No initialization function found.\n"); | |||
| exit(1); | |||
| } | |||
| void gotoblas_dynamic_quit(void) { | |||
| gotoblas = NULL; | |||
| } | |||
| @@ -1,4 +1,5 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2024 The OpenBLAS Project */ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| @@ -47,12 +48,16 @@ | |||
| #define SMP_THRESHOLD_MIN 65536.0 | |||
| #ifdef XDOUBLE | |||
| #define ERROR_NAME "QGEMM " | |||
| #define GEMV BLASFUNC(qgemv) | |||
| #elif defined(DOUBLE) | |||
| #define ERROR_NAME "DGEMM " | |||
| #define GEMV BLASFUNC(dgemv) | |||
| #elif defined(BFLOAT16) | |||
| #define ERROR_NAME "SBGEMM " | |||
| #define GEMV BLASFUNC(sbgemv) | |||
| #else | |||
| #define ERROR_NAME "SGEMM " | |||
| #define GEMV BLASFUNC(sgemv) | |||
| #endif | |||
| #else | |||
| #define SMP_THRESHOLD_MIN 8192.0 | |||
| @@ -493,6 +498,52 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS | |||
| args.m, args.n, args.k, args.lda, args.ldb, args.ldc); | |||
| #endif | |||
| #if defined(GEMM_GEMV_FORWARD) && !defined(GEMM3M) && !defined(COMPLEX) | |||
| // Check if we can convert GEMM -> GEMV | |||
| if (args.k != 0) { | |||
| if (args.n == 1) { | |||
| blasint inc_x = 1; | |||
| blasint inc_y = 1; | |||
| // These were passed in as blasint, but the struct translates them to blaslong | |||
| blasint m = args.m; | |||
| blasint n = args.k; | |||
| blasint lda = args.lda; | |||
| // Create new transpose parameters | |||
| char NT = 'N'; | |||
| if (transa & 1) { | |||
| NT = 'T'; | |||
| m = args.k; | |||
| n = args.m; | |||
| } | |||
| if (transb & 1) { | |||
| inc_x = args.ldb; | |||
| } | |||
| GEMV(&NT, &m, &n, args.alpha, args.a, &lda, args.b, &inc_x, args.beta, args.c, &inc_y); | |||
| return; | |||
| } | |||
| if (args.m == 1) { | |||
| blasint inc_x = args.lda; | |||
| blasint inc_y = args.ldc; | |||
| // These were passed in as blasint, but the struct translates them to blaslong | |||
| blasint m = args.k; | |||
| blasint n = args.n; | |||
| blasint ldb = args.ldb; | |||
| // Create new transpose parameters | |||
| char NT = 'T'; | |||
| if (transa & 1) { | |||
| inc_x = 1; | |||
| } | |||
| if (transb & 1) { | |||
| NT = 'N'; | |||
| m = args.n; | |||
| n = args.k; | |||
| } | |||
| GEMV(&NT, &m, &n, args.alpha, args.b, &ldb, args.a, &inc_x, args.beta, args.c, &inc_y); | |||
| return; | |||
| } | |||
| } | |||
| #endif | |||
| IDEBUG_START; | |||
| FUNCTION_PROFILE_START(); | |||
| @@ -85,7 +85,7 @@ void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx){ | |||
| if (nthreads == 1) { | |||
| #endif | |||
| SCAL_K(n, 0, 0, alpha, x, incx, NULL, 0, NULL, 0); | |||
| SCAL_K(n, 0, 0, alpha, x, incx, NULL, 0, NULL, 1); | |||
| #ifdef SMP | |||
| } else { | |||
| @@ -102,7 +102,7 @@ void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx){ | |||
| #else | |||
| &alpha, | |||
| #endif | |||
| x, incx, NULL, 0, NULL, 0, (int (*)(void))SCAL_K, nthreads); | |||
| x, incx, NULL, 0, NULL, 1, (int (*)(void))SCAL_K, nthreads); | |||
| } | |||
| #endif | |||
| @@ -43,9 +43,22 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS | |||
| if ( (n <= 0) || (inc_x <= 0)) | |||
| return(0); | |||
| if (dummy2 == 0) { | |||
| while(j < n) | |||
| { | |||
| while(j < n) | |||
| { | |||
| if ( da == 0.0 ) | |||
| x[i]=0.0; | |||
| else | |||
| x[i] = da * x[i] ; | |||
| i += inc_x ; | |||
| j++; | |||
| } | |||
| } else { | |||
| while(j < n) | |||
| { | |||
| if ( da == 0.0 ) | |||
| if (!isnan(x[i]) && !isinf(x[i])) { | |||
| @@ -59,6 +72,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS | |||
| i += inc_x ; | |||
| j++; | |||
| } | |||
| } | |||
| return 0; | |||
| @@ -1 +1,6 @@ | |||
| include $(KERNELDIR)/KERNEL.ARMV8SVE | |||
| SGEMVNKERNEL = gemv_n_sve.c | |||
| DGEMVNKERNEL = gemv_n_sve.c | |||
| SGEMVTKERNEL = gemv_t_sve.c | |||
| DGEMVTKERNEL = gemv_t_sve.c | |||
| @@ -131,6 +131,16 @@ SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMM_SMALL_M_PERMIT = gemm_small_kernel_permit_sve.c | |||
| SGEMM_SMALL_K_NT = sgemm_small_kernel_nt_sve.c | |||
| SGEMM_SMALL_K_B0_NT = sgemm_small_kernel_nt_sve.c | |||
| SGEMM_SMALL_K_NN = sgemm_small_kernel_nn_sve.c | |||
| SGEMM_SMALL_K_B0_NN = sgemm_small_kernel_nn_sve.c | |||
| SGEMM_SMALL_K_TT = sgemm_small_kernel_tt_sve.c | |||
| SGEMM_SMALL_K_B0_TT = sgemm_small_kernel_tt_sve.c | |||
| SGEMM_SMALL_K_TN = sgemm_small_kernel_tn_sve.c | |||
| SGEMM_SMALL_K_B0_TN = sgemm_small_kernel_tn_sve.c | |||
| STRMMUNCOPY_M = trmm_uncopy_sve_v1.c | |||
| STRMMLNCOPY_M = trmm_lncopy_sve_v1.c | |||
| STRMMUTCOPY_M = trmm_utcopy_sve_v1.c | |||
| @@ -152,6 +162,16 @@ DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMM_SMALL_M_PERMIT = gemm_small_kernel_permit_sve.c | |||
| DGEMM_SMALL_K_NT = dgemm_small_kernel_nt_sve.c | |||
| DGEMM_SMALL_K_B0_NT = dgemm_small_kernel_nt_sve.c | |||
| DGEMM_SMALL_K_NN = dgemm_small_kernel_nn_sve.c | |||
| DGEMM_SMALL_K_B0_NN = dgemm_small_kernel_nn_sve.c | |||
| DGEMM_SMALL_K_TT = dgemm_small_kernel_tt_sve.c | |||
| DGEMM_SMALL_K_B0_TT = dgemm_small_kernel_tt_sve.c | |||
| DGEMM_SMALL_K_TN = dgemm_small_kernel_tn_sve.c | |||
| DGEMM_SMALL_K_B0_TN = dgemm_small_kernel_tn_sve.c | |||
| DTRMMUNCOPY_M = trmm_uncopy_sve_v1.c | |||
| DTRMMLNCOPY_M = trmm_lncopy_sve_v1.c | |||
| DTRMMUTCOPY_M = trmm_utcopy_sve_v1.c | |||
| @@ -1 +1,4 @@ | |||
| include $(KERNELDIR)/KERNEL.ARMV8SVE | |||
| SGEMVTKERNEL = gemv_t_sve.c | |||
| DGEMVTKERNEL = gemv_t_sve.c | |||
| @@ -0,0 +1,742 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2024, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |||
| CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE | |||
| GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | |||
| HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | |||
| LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF | |||
| THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #include <arm_neon.h> | |||
| #include <arm_sve.h> | |||
| #if defined(__ARM_NEON_SVE_BRIDGE) && defined(__has_include) && \ | |||
| __has_include(<arm_neon_sve_bridge.h>) | |||
| #include <arm_neon_sve_bridge.h> | |||
| #else | |||
| #define svdup_neonq_f32(fixed_reg) \ | |||
| ({ \ | |||
| svfloat32_t scalable_reg; \ | |||
| asm("mov %0.q, %q1" : "=w"(scalable_reg) : "w"(fixed_reg) :); \ | |||
| scalable_reg; \ | |||
| }) | |||
| #define svdup_neonq_f64(fixed_reg) \ | |||
| ({ \ | |||
| svfloat64_t scalable_reg; \ | |||
| asm("mov %0.q, %q1" : "=w"(scalable_reg) : "w"(fixed_reg) :); \ | |||
| scalable_reg; \ | |||
| }) | |||
| #endif | |||
| #define RESET_A_POINTER() a_offset = A; | |||
| #define CREATE_A_POINTER(m, scale) FLOAT* a_offset##m = a_offset + scale; | |||
| #define UPDATE_A_POINTER(scale) a_offset = a_offset + scale; | |||
| #define A_ELEMENT_K(m, offset_k) *(a_offset##m + (k + offset_k) * lda) | |||
| #define A_ELEMENT(m) A_ELEMENT_K(m, 0) | |||
| #define RESET_B_POINTER() b_offset = B; | |||
| #define CREATE_B_POINTER(n, scale) FLOAT* b_offset##n = b_offset + scale * ldb; | |||
| #define UPDATE_B_POINTER(scale) b_offset = b_offset + scale * ldb; | |||
| #define B_ELEMENT_K(n, offset_k) *(b_offset##n + (k + offset_k)) | |||
| #define B_ELEMENT(n) B_ELEMENT_K(n, 0) | |||
| #define CREATE_C_POINTER(n, scale) FLOAT* c_offset##n = c_offset + scale * ldc; | |||
| #define INCR_C_POINTER(m, incr) // c_offset ## m += incr; | |||
| #define UPDATE_C_POINTER(scale) c_offset = c_offset + scale * ldc; | |||
| #define C_ELEMENT(m, n) *(c_offset##n + ((m * v_size) + i)) | |||
| // #undef C_ELEMENT | |||
| // #define C_ELEMENT(m, n) C[(i+(m))+(j+(n))*ldc] | |||
| #define PACK_ELEMENT_K(n, offset_k) packed_b[(k + offset_k) * 4 + n] | |||
| #define PACK_ELEMENT(n) PACK_ELEMENT_K(n, 0) | |||
| // ASIMD | |||
| #define DECLARE_RESULT_VECTOR2(m, n) \ | |||
| float64x2_t result##m##n = vdupq_n_f64(0.0); | |||
| #define DECLARE_RESULT(m, n) float64_t result##m##n = 0.0; | |||
| #define BROADCAST_LOAD_A2(m, offset_k) \ | |||
| float64x2_t a##m##_k##offset_k = vld1q_dup_f64(&A_ELEMENT_K(m, offset_k)); | |||
| #define LOAD_A1(m, offset_k) \ | |||
| float64_t a##m##_k##offset_k = A_ELEMENT_K(m, offset_k); | |||
| #define VECTOR_LOAD_B_K2(n, offset_k) \ | |||
| float64x2_t b##k##n##_k##offset_k = vld1q_f64(&B_ELEMENT_K(n, offset_k)); | |||
| #define TRANSPOSE_B2_K2(n0, n1, offset_k0, offset_k1) \ | |||
| float64x2_t b##n0##_k##offset_k0 = \ | |||
| vzip1q_f64(b##k##n0##_k##offset_k0, b##k##n1##_k##offset_k0); \ | |||
| float64x2_t b##n0##_k##offset_k1 = \ | |||
| vzip2q_f64(b##k##n0##_k##offset_k0, b##k##n1##_k##offset_k0); | |||
| #define SCALE_B2_K2(n0, offset_k0, offset_k1) \ | |||
| svfloat64_t b##s##n0##_k##offset_k0 = svdup_neonq_f64(b##n0##_k##offset_k0); \ | |||
| svfloat64_t b##s##n0##_k##offset_k1 = svdup_neonq_f64(b##n0##_k##offset_k1); | |||
| #define GATHER_LOAD_B2(n, offset_k) \ | |||
| float64x2_t b##n##_k##offset_k = vdupq_n_f64(B_ELEMENT_K(n, offset_k)); \ | |||
| b##n##_k##offset_k = \ | |||
| vsetq_lane_f64(B_ELEMENT_K(n + 1, offset_k), b##n##_k##offset_k, 1); | |||
| #define VECTOR_UNPACK_B2(n, offset_k) \ | |||
| float64x2_t b##n##_k##offset_k = vld1q_f64(&PACK_ELEMENT_K(n, offset_k)); | |||
| #define VECTOR_PACK_B2(n, offset_k) \ | |||
| vst1q_f64(&PACK_ELEMENT_K(n, offset_k), b##n##_k##offset_k); | |||
| #define PACK_B0(n, offset_k) \ | |||
| PACK_ELEMENT_K(n, offset_k) = vget_lane_f64(b##n##_k##offset_k, 0); | |||
| #define UPDATE_RESULT_VECTOR2(m, n, offset_k) \ | |||
| result##m##n = \ | |||
| vfmaq_f64(result##m##n, a##m##_k##offset_k, b##n##_k##offset_k); | |||
| #define UPDATE_RESULT(m, n, offset_k) \ | |||
| result##m##n = result##m##n + a##m##_k##offset_k * b##n##_k##offset_k; | |||
| #ifdef B0 | |||
| #define SCATTER_STORE2(m, n) \ | |||
| result##m##n = vmulq_f64(result##m##n, vdupq_n_f64(alpha)); \ | |||
| C_ELEMENT(m, n + 0) = vgetq_lane_f64(result##m##n, 0); \ | |||
| C_ELEMENT(m, n + 1) = vgetq_lane_f64(result##m##n, 1); | |||
| #else | |||
| #define SCATTER_STORE2(m, n) \ | |||
| result##m##n = vmulq_f64(result##m##n, vdupq_n_f64(alpha)); \ | |||
| C_ELEMENT(m, n + 0) = \ | |||
| C_ELEMENT(m, n + 0) * beta + vgetq_lane_f64(result##m##n, 0); \ | |||
| C_ELEMENT(m, n + 1) = \ | |||
| C_ELEMENT(m, n + 1) * beta + vgetq_lane_f64(result##m##n, 1); | |||
| #endif | |||
| // SVE | |||
| #define DECLARE_RESULT_VECTOR(m, n) svfloat64_t result##m##n = svdup_f64(0.0); | |||
| #define BROADCAST_LOAD_A(m, offset_k) \ | |||
| svfloat64_t a##s##m##_k##offset_k = svdup_f64(A_ELEMENT_K(m, offset_k)); | |||
| #define BROADCAST_LOAD_B(n, offset_k) \ | |||
| svfloat64_t b##s##n##_k##offset_k = svdup_f64(B_ELEMENT_K(n, offset_k)); | |||
| #define VECTOR_LOAD_A(pg, m, offset_k) \ | |||
| svfloat64_t a##s##m##_k##offset_k = svld1(pg, &A_ELEMENT_K(m, offset_k)); | |||
| #define QUADWORD_LOAD_B(n, offset_k) \ | |||
| svfloat64_t b##s##n##_k##offset_k = \ | |||
| svld1rq(pg_true, &B_ELEMENT_K(n, offset_k)); | |||
| #define PACK_B(n, offset_k) \ | |||
| svst1(pg_first, &PACK_ELEMENT_K(n, offset_k), b##s##n##_k##offset_k); | |||
| #define VECTOR_PACK_B(n, offset_k) \ | |||
| svst1(pg_true, &PACK_ELEMENT_K(n* v_size, offset_k), b##s##n##_k##offset_k); | |||
| #define QUADWORD_PACK_B(n, offset_k) \ | |||
| svst1(pg_quad, &PACK_ELEMENT_K(n, offset_k), b##s##n##_k##offset_k); | |||
| #define UNPACK_VECTOR_B(n, offset_k) \ | |||
| svfloat64_t b##s##n##_k##offset_k = \ | |||
| svld1(pg_true, &PACK_ELEMENT_K(n * v_size, offset_k)); | |||
| #define UNPACK_BROADCAST_B(n, offset_k) \ | |||
| svfloat64_t b##s##n##_k##offset_k = svdup_f64(PACK_ELEMENT_K(n, offset_k)); | |||
| #define UNPACK_QUADWORD_B(n, offset_k) \ | |||
| svfloat64_t b##s##n##_k##offset_k = \ | |||
| svld1rq(pg_true, &PACK_ELEMENT_K(n, offset_k)); | |||
| #define UPDATE_RESULT_VECTOR(pg, m, n, offset_k) \ | |||
| result##m##n = \ | |||
| svmla_m(pg, result##m##n, a##s##m##_k##offset_k, b##s##n##_k##offset_k); | |||
| #define UPDATE_RESULT_VECTOR_QUADWORD(m, n, outer, lane, offset_k) \ | |||
| result##m##n = svmla_lane( \ | |||
| result##m##n, a##s##m##_k##offset_k, b##s##outer##_k##offset_k, lane); | |||
| #ifdef B0 | |||
| #define VECTOR_STORE(pg, m, n) \ | |||
| result##m##n = svmul_m(pg, result##m##n, alpha_vec); \ | |||
| svst1(pg, &C_ELEMENT(m, n), result##m##n); | |||
| #define SCATTER_STORE(pg, m, n) \ | |||
| result##m##n = svmul_m(pg, result##m##n, alpha_vec); \ | |||
| svst1_scatter_index(pg, &C_ELEMENT(m, n), ldc_vec, result##m##n); | |||
| #else | |||
| #define VECTOR_STORE(pg, m, n) \ | |||
| result##m##n = svmul_m(pg, result##m##n, alpha_vec); \ | |||
| result##m##n = \ | |||
| svmla_m(pg, result##m##n, svld1(pg, &C_ELEMENT(m, n)), beta_vec); \ | |||
| svst1(pg, &C_ELEMENT(m, n), result##m##n); | |||
| #define SCATTER_STORE(pg, m, n) \ | |||
| result##m##n = svmul_m(pg, result##m##n, alpha_vec); \ | |||
| result##m##n = svmla_m(pg, \ | |||
| result##m##n, \ | |||
| svld1_gather_index(pg, &C_ELEMENT(m, n), ldc_vec), \ | |||
| beta_vec); \ | |||
| svst1_scatter_index(pg, &C_ELEMENT(m, n), ldc_vec, result##m##n); | |||
| #endif | |||
| #ifndef LIKELY | |||
| #ifdef __GNUC__ | |||
| #define LIKELY(x) __builtin_expect(!!(x), 1) | |||
| #else | |||
| #define LIKELY(x) (x) | |||
| #endif | |||
| #endif | |||
| #ifdef B0 | |||
| int | |||
| CNAME(BLASLONG M, | |||
| BLASLONG N, | |||
| BLASLONG K, | |||
| IFLOAT* A, | |||
| BLASLONG lda, | |||
| FLOAT alpha, | |||
| IFLOAT* B, | |||
| BLASLONG ldb, | |||
| FLOAT* C, | |||
| BLASLONG ldc) | |||
| #else | |||
| int | |||
| CNAME(BLASLONG M, | |||
| BLASLONG N, | |||
| BLASLONG K, | |||
| IFLOAT* A, | |||
| BLASLONG lda, | |||
| FLOAT alpha, | |||
| IFLOAT* B, | |||
| BLASLONG ldb, | |||
| FLOAT beta, | |||
| FLOAT* C, | |||
| BLASLONG ldc) | |||
| #endif | |||
| { | |||
| const uint64_t v_size = svcntd(); | |||
| const uint64_t v_size2 = v_size * 2; | |||
| const svbool_t pg_true = svptrue_b64(); | |||
| const svbool_t pg_quad = svwhilelt_b64(0, 2); | |||
| const svbool_t pg_first = svwhilelt_b64(0, 1); | |||
| const svfloat64_t alpha_vec = svdup_f64(alpha); | |||
| #ifndef B0 | |||
| const svfloat64_t beta_vec = svdup_f64(beta); | |||
| #endif | |||
| const BLASLONG n4 = N & -4; | |||
| const BLASLONG n2 = N & -2; | |||
| const BLASLONG v_m2 = M & -v_size2; | |||
| const BLASLONG v_m1 = M & -v_size; | |||
| const BLASLONG k2 = K & -2; | |||
| const int pack_b = M >= v_size2 && N >= 8 && K >= 8 ? 1 : 0; | |||
| FLOAT* packed_b = | |||
| (pack_b) ? packed_b = (FLOAT*)malloc(K * 4 * sizeof(FLOAT)) : NULL; | |||
| FLOAT* b_offset = B; | |||
| FLOAT* a_offset = A; | |||
| FLOAT* c_offset = C; | |||
| BLASLONG j = 0; | |||
| for (; j < n4; j += 4) { | |||
| CREATE_C_POINTER(0, 0); | |||
| CREATE_C_POINTER(1, 1); | |||
| CREATE_C_POINTER(2, 2); | |||
| CREATE_C_POINTER(3, 3); | |||
| CREATE_B_POINTER(0, 0); | |||
| CREATE_B_POINTER(1, 1); | |||
| CREATE_B_POINTER(2, 2); | |||
| CREATE_B_POINTER(3, 3); | |||
| BLASLONG i = 0; | |||
| for (; i < v_m2; i += v_size2) { | |||
| CREATE_A_POINTER(0, 0); | |||
| CREATE_A_POINTER(1, v_size); | |||
| UPDATE_A_POINTER(v_size2); | |||
| BLASLONG k = 0; | |||
| DECLARE_RESULT_VECTOR(0, 0); | |||
| DECLARE_RESULT_VECTOR(0, 1); | |||
| DECLARE_RESULT_VECTOR(0, 2); | |||
| DECLARE_RESULT_VECTOR(0, 3); | |||
| DECLARE_RESULT_VECTOR(1, 0); | |||
| DECLARE_RESULT_VECTOR(1, 1); | |||
| DECLARE_RESULT_VECTOR(1, 2); | |||
| DECLARE_RESULT_VECTOR(1, 3); | |||
| if (LIKELY(packed_b != NULL)) { | |||
| if (i == 0) { | |||
| for (; k < k2; k += 2) { | |||
| VECTOR_LOAD_B_K2(0, 0); | |||
| VECTOR_LOAD_B_K2(1, 0); | |||
| TRANSPOSE_B2_K2(0, 1, 0, 1); | |||
| SCALE_B2_K2(0, 0, 1); | |||
| VECTOR_PACK_B2(0, 0); | |||
| VECTOR_PACK_B2(0, 1); | |||
| VECTOR_LOAD_A(pg_true, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); | |||
| VECTOR_LOAD_A(pg_true, 0, 1); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 1); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 1); | |||
| VECTOR_LOAD_B_K2(2, 0); | |||
| VECTOR_LOAD_B_K2(3, 0); | |||
| TRANSPOSE_B2_K2(2, 3, 0, 1); | |||
| SCALE_B2_K2(2, 0, 1); | |||
| VECTOR_PACK_B2(2, 0); | |||
| VECTOR_PACK_B2(2, 1); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 2, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 2, 1, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 2, 0, 1); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 2, 1, 1); | |||
| VECTOR_LOAD_A(pg_true, 1, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 2, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 2, 1, 0); | |||
| VECTOR_LOAD_A(pg_true, 1, 1); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 1); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 1); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 2, 0, 1); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 2, 1, 1); | |||
| } | |||
| for (; k < K; k++) { | |||
| BROADCAST_LOAD_B(0, 0); | |||
| PACK_B(0, 0); | |||
| VECTOR_LOAD_A(pg_true, 0, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); | |||
| BROADCAST_LOAD_B(1, 0); | |||
| PACK_B(1, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 1, 0); | |||
| VECTOR_LOAD_A(pg_true, 1, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 0, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 1, 0); | |||
| BROADCAST_LOAD_B(2, 0); | |||
| PACK_B(2, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 2, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 2, 0); | |||
| BROADCAST_LOAD_B(3, 0); | |||
| PACK_B(3, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 3, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 3, 0); | |||
| } | |||
| } else { | |||
| for (; k < K; k++) { | |||
| UNPACK_QUADWORD_B(0, 0); | |||
| VECTOR_LOAD_A(pg_true, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); | |||
| UNPACK_QUADWORD_B(2, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 2, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 2, 1, 0); | |||
| VECTOR_LOAD_A(pg_true, 1, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 2, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 2, 1, 0); | |||
| } | |||
| } | |||
| } else { | |||
| for (; k < k2; k += 2) { | |||
| VECTOR_LOAD_B_K2(0, 0); | |||
| VECTOR_LOAD_B_K2(1, 0); | |||
| TRANSPOSE_B2_K2(0, 1, 0, 1); | |||
| SCALE_B2_K2(0, 0, 1); | |||
| VECTOR_LOAD_A(pg_true, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); | |||
| VECTOR_LOAD_A(pg_true, 0, 1); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 1); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 1); | |||
| VECTOR_LOAD_B_K2(2, 0); | |||
| VECTOR_LOAD_B_K2(3, 0); | |||
| TRANSPOSE_B2_K2(2, 3, 0, 1); | |||
| SCALE_B2_K2(2, 0, 1); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 2, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 2, 1, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 2, 0, 1); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 2, 1, 1); | |||
| VECTOR_LOAD_A(pg_true, 1, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 2, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 2, 1, 0); | |||
| VECTOR_LOAD_A(pg_true, 1, 1); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 1); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 1); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 2, 0, 1); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 2, 1, 1); | |||
| } | |||
| for (; k < K; k++) { | |||
| BROADCAST_LOAD_B(0, 0); | |||
| VECTOR_LOAD_A(pg_true, 0, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); | |||
| BROADCAST_LOAD_B(1, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 1, 0); | |||
| VECTOR_LOAD_A(pg_true, 1, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 0, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 1, 0); | |||
| BROADCAST_LOAD_B(2, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 2, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 2, 0); | |||
| BROADCAST_LOAD_B(3, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 3, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 3, 0); | |||
| } | |||
| } | |||
| VECTOR_STORE(pg_true, 0, 0); | |||
| VECTOR_STORE(pg_true, 0, 1); | |||
| VECTOR_STORE(pg_true, 0, 2); | |||
| VECTOR_STORE(pg_true, 0, 3); | |||
| VECTOR_STORE(pg_true, 1, 0); | |||
| VECTOR_STORE(pg_true, 1, 1); | |||
| VECTOR_STORE(pg_true, 1, 2); | |||
| VECTOR_STORE(pg_true, 1, 3); | |||
| INCR_C_POINTER(0, v_size2); | |||
| INCR_C_POINTER(1, v_size2); | |||
| INCR_C_POINTER(2, v_size2); | |||
| INCR_C_POINTER(3, v_size2); | |||
| } | |||
| for (; i < v_m1; i += v_size) { | |||
| CREATE_A_POINTER(0, 0); | |||
| UPDATE_A_POINTER(v_size); | |||
| BLASLONG k = 0; | |||
| DECLARE_RESULT_VECTOR(0, 0); | |||
| DECLARE_RESULT_VECTOR(0, 1); | |||
| DECLARE_RESULT_VECTOR(0, 2); | |||
| DECLARE_RESULT_VECTOR(0, 3); | |||
| if (LIKELY(packed_b != NULL)) { | |||
| for (; k < K; k++) { | |||
| UNPACK_QUADWORD_B(0, 0); | |||
| VECTOR_LOAD_A(pg_true, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); | |||
| UNPACK_QUADWORD_B(2, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 2, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 2, 1, 0); | |||
| } | |||
| } else { | |||
| for (; k < k2; k += 2) { | |||
| VECTOR_LOAD_B_K2(0, 0); | |||
| VECTOR_LOAD_B_K2(1, 0); | |||
| TRANSPOSE_B2_K2(0, 1, 0, 1); | |||
| SCALE_B2_K2(0, 0, 1); | |||
| VECTOR_LOAD_A(pg_true, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); | |||
| VECTOR_LOAD_A(pg_true, 0, 1); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 1); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 1); | |||
| VECTOR_LOAD_B_K2(2, 0); | |||
| VECTOR_LOAD_B_K2(3, 0); | |||
| TRANSPOSE_B2_K2(2, 3, 0, 1); | |||
| SCALE_B2_K2(2, 0, 1); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 2, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 2, 1, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 2, 0, 1); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 2, 1, 1); | |||
| } | |||
| for (; k < K; k++) { | |||
| BROADCAST_LOAD_B(0, 0); | |||
| VECTOR_LOAD_A(pg_true, 0, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); | |||
| BROADCAST_LOAD_B(1, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 1, 0); | |||
| BROADCAST_LOAD_B(2, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 2, 0); | |||
| BROADCAST_LOAD_B(3, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 3, 0); | |||
| } | |||
| } | |||
| VECTOR_STORE(pg_true, 0, 0); | |||
| VECTOR_STORE(pg_true, 0, 1); | |||
| VECTOR_STORE(pg_true, 0, 2); | |||
| VECTOR_STORE(pg_true, 0, 3); | |||
| INCR_C_POINTER(0, v_size); | |||
| INCR_C_POINTER(1, v_size); | |||
| INCR_C_POINTER(2, v_size); | |||
| INCR_C_POINTER(3, v_size); | |||
| } | |||
| for (; i < M; i += v_size) { | |||
| const svbool_t pg_tail = svwhilelt_b64((uint64_t)i, (uint64_t)(M)); | |||
| CREATE_A_POINTER(0, 0); | |||
| UPDATE_A_POINTER(0); | |||
| BLASLONG k = 0; | |||
| DECLARE_RESULT_VECTOR(0, 0); | |||
| DECLARE_RESULT_VECTOR(0, 1); | |||
| DECLARE_RESULT_VECTOR(0, 2); | |||
| DECLARE_RESULT_VECTOR(0, 3); | |||
| if (LIKELY(packed_b != NULL)) { | |||
| for (; k < K; k++) { | |||
| UNPACK_QUADWORD_B(0, 0); | |||
| VECTOR_LOAD_A(pg_tail, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); | |||
| UNPACK_QUADWORD_B(2, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 2, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 2, 1, 0); | |||
| } | |||
| } else { | |||
| for (; k < k2; k += 2) { | |||
| VECTOR_LOAD_B_K2(0, 0); | |||
| VECTOR_LOAD_B_K2(1, 0); | |||
| TRANSPOSE_B2_K2(0, 1, 0, 1); | |||
| SCALE_B2_K2(0, 0, 1); | |||
| VECTOR_LOAD_A(pg_tail, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); | |||
| VECTOR_LOAD_A(pg_tail, 0, 1); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 1); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 1); | |||
| VECTOR_LOAD_B_K2(2, 0); | |||
| VECTOR_LOAD_B_K2(3, 0); | |||
| TRANSPOSE_B2_K2(2, 3, 0, 1); | |||
| SCALE_B2_K2(2, 0, 1); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 2, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 2, 1, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 2, 0, 1); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 2, 1, 1); | |||
| } | |||
| for (; k < K; k++) { | |||
| BROADCAST_LOAD_B(0, 0); | |||
| VECTOR_LOAD_A(pg_tail, 0, 0); | |||
| UPDATE_RESULT_VECTOR(pg_tail, 0, 0, 0); | |||
| BROADCAST_LOAD_B(1, 0); | |||
| UPDATE_RESULT_VECTOR(pg_tail, 0, 1, 0); | |||
| BROADCAST_LOAD_B(2, 0); | |||
| UPDATE_RESULT_VECTOR(pg_tail, 0, 2, 0); | |||
| BROADCAST_LOAD_B(3, 0); | |||
| UPDATE_RESULT_VECTOR(pg_tail, 0, 3, 0); | |||
| } | |||
| } | |||
| VECTOR_STORE(pg_tail, 0, 0); | |||
| VECTOR_STORE(pg_tail, 0, 1); | |||
| VECTOR_STORE(pg_tail, 0, 2); | |||
| VECTOR_STORE(pg_tail, 0, 3); | |||
| INCR_C_POINTER(0, 0); | |||
| INCR_C_POINTER(1, 0); | |||
| INCR_C_POINTER(2, 0); | |||
| INCR_C_POINTER(3, 0); | |||
| } | |||
| UPDATE_B_POINTER(4); | |||
| RESET_A_POINTER(); | |||
| UPDATE_C_POINTER(4); | |||
| } | |||
| for (; j < n2; j += 2) { | |||
| CREATE_C_POINTER(0, 0); | |||
| CREATE_C_POINTER(1, 1); | |||
| CREATE_B_POINTER(0, 0); | |||
| CREATE_B_POINTER(1, 1); | |||
| BLASLONG i = 0; | |||
| for (; i < v_m2; i += v_size2) { | |||
| CREATE_A_POINTER(0, 0); | |||
| CREATE_A_POINTER(1, v_size); | |||
| UPDATE_A_POINTER(v_size2); | |||
| BLASLONG k = 0; | |||
| DECLARE_RESULT_VECTOR(0, 0); | |||
| DECLARE_RESULT_VECTOR(0, 1); | |||
| DECLARE_RESULT_VECTOR(1, 0); | |||
| DECLARE_RESULT_VECTOR(1, 1); | |||
| for (; k < k2; k += 2) { | |||
| VECTOR_LOAD_B_K2(0, 0); | |||
| VECTOR_LOAD_B_K2(1, 0); | |||
| TRANSPOSE_B2_K2(0, 1, 0, 1); | |||
| SCALE_B2_K2(0, 0, 1); | |||
| VECTOR_LOAD_A(pg_true, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); | |||
| VECTOR_LOAD_A(pg_true, 0, 1); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 1); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 1); | |||
| VECTOR_LOAD_A(pg_true, 1, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 0); | |||
| VECTOR_LOAD_A(pg_true, 1, 1); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 1); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 1); | |||
| } | |||
| for (; k < K; k++) { | |||
| BROADCAST_LOAD_B(0, 0); | |||
| VECTOR_LOAD_A(pg_true, 0, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); | |||
| BROADCAST_LOAD_B(1, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 1, 0); | |||
| VECTOR_LOAD_A(pg_true, 1, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 0, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 1, 0); | |||
| } | |||
| VECTOR_STORE(pg_true, 0, 0); | |||
| VECTOR_STORE(pg_true, 0, 1); | |||
| VECTOR_STORE(pg_true, 1, 0); | |||
| VECTOR_STORE(pg_true, 1, 1); | |||
| INCR_C_POINTER(0, v_size2); | |||
| INCR_C_POINTER(1, v_size2); | |||
| } | |||
| for (; i < v_m1; i += v_size) { | |||
| CREATE_A_POINTER(0, 0); | |||
| UPDATE_A_POINTER(v_size); | |||
| BLASLONG k = 0; | |||
| DECLARE_RESULT_VECTOR(0, 0); | |||
| DECLARE_RESULT_VECTOR(0, 1); | |||
| for (; k < k2; k += 2) { | |||
| VECTOR_LOAD_B_K2(0, 0); | |||
| VECTOR_LOAD_B_K2(1, 0); | |||
| TRANSPOSE_B2_K2(0, 1, 0, 1); | |||
| SCALE_B2_K2(0, 0, 1); | |||
| VECTOR_LOAD_A(pg_true, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); | |||
| VECTOR_LOAD_A(pg_true, 0, 1); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 1); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 1); | |||
| } | |||
| for (; k < K; k++) { | |||
| BROADCAST_LOAD_B(0, 0); | |||
| VECTOR_LOAD_A(pg_true, 0, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); | |||
| BROADCAST_LOAD_B(1, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 1, 0); | |||
| } | |||
| VECTOR_STORE(pg_true, 0, 0); | |||
| VECTOR_STORE(pg_true, 0, 1); | |||
| INCR_C_POINTER(0, v_size); | |||
| INCR_C_POINTER(1, v_size); | |||
| } | |||
| for (; i < M; i += v_size) { | |||
| const svbool_t pg_tail = svwhilelt_b64((uint64_t)i, (uint64_t)(M)); | |||
| CREATE_A_POINTER(0, 0); | |||
| UPDATE_A_POINTER(0); | |||
| BLASLONG k = 0; | |||
| DECLARE_RESULT_VECTOR(0, 0); | |||
| DECLARE_RESULT_VECTOR(0, 1); | |||
| for (; k < k2; k += 2) { | |||
| VECTOR_LOAD_B_K2(0, 0); | |||
| VECTOR_LOAD_B_K2(1, 0); | |||
| TRANSPOSE_B2_K2(0, 1, 0, 1); | |||
| SCALE_B2_K2(0, 0, 1); | |||
| VECTOR_LOAD_A(pg_tail, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); | |||
| VECTOR_LOAD_A(pg_tail, 0, 1); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 1); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 1); | |||
| } | |||
| for (; k < K; k++) { | |||
| BROADCAST_LOAD_B(0, 0); | |||
| VECTOR_LOAD_A(pg_tail, 0, 0); | |||
| UPDATE_RESULT_VECTOR(pg_tail, 0, 0, 0); | |||
| BROADCAST_LOAD_B(1, 0); | |||
| UPDATE_RESULT_VECTOR(pg_tail, 0, 1, 0); | |||
| } | |||
| VECTOR_STORE(pg_tail, 0, 0); | |||
| VECTOR_STORE(pg_tail, 0, 1); | |||
| INCR_C_POINTER(0, 0); | |||
| INCR_C_POINTER(1, 0); | |||
| } | |||
| UPDATE_B_POINTER(2); | |||
| RESET_A_POINTER(); | |||
| UPDATE_C_POINTER(2); | |||
| } | |||
| for (; j < N; j++) { | |||
| CREATE_C_POINTER(0, 0); | |||
| CREATE_B_POINTER(0, 0); | |||
| BLASLONG i = 0; | |||
| for (; i < v_m2; i += v_size2) { | |||
| CREATE_A_POINTER(0, 0); | |||
| CREATE_A_POINTER(1, v_size); | |||
| UPDATE_A_POINTER(v_size2); | |||
| BLASLONG k = 0; | |||
| DECLARE_RESULT_VECTOR(0, 0); | |||
| DECLARE_RESULT_VECTOR(1, 0); | |||
| for (; k < K; k++) { | |||
| BROADCAST_LOAD_B(0, 0); | |||
| VECTOR_LOAD_A(pg_true, 0, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); | |||
| VECTOR_LOAD_A(pg_true, 1, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 0, 0); | |||
| } | |||
| VECTOR_STORE(pg_true, 0, 0); | |||
| VECTOR_STORE(pg_true, 1, 0); | |||
| INCR_C_POINTER(0, v_size2); | |||
| } | |||
| for (; i < v_m1; i += v_size) { | |||
| CREATE_A_POINTER(0, 0); | |||
| UPDATE_A_POINTER(v_size); | |||
| BLASLONG k = 0; | |||
| DECLARE_RESULT_VECTOR(0, 0); | |||
| for (; k < K; k++) { | |||
| BROADCAST_LOAD_B(0, 0); | |||
| VECTOR_LOAD_A(pg_true, 0, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); | |||
| } | |||
| VECTOR_STORE(pg_true, 0, 0); | |||
| INCR_C_POINTER(0, v_size); | |||
| } | |||
| for (; i < M; i += v_size) { | |||
| const svbool_t pg_tail = svwhilelt_b64((uint64_t)i, (uint64_t)(M)); | |||
| CREATE_A_POINTER(0, 0); | |||
| UPDATE_A_POINTER(0); | |||
| BLASLONG k = 0; | |||
| DECLARE_RESULT_VECTOR(0, 0); | |||
| for (; k < K; k++) { | |||
| BROADCAST_LOAD_B(0, 0); | |||
| VECTOR_LOAD_A(pg_tail, 0, 0); | |||
| UPDATE_RESULT_VECTOR(pg_tail, 0, 0, 0); | |||
| } | |||
| VECTOR_STORE(pg_tail, 0, 0); | |||
| INCR_C_POINTER(0, 0); | |||
| } | |||
| UPDATE_B_POINTER(1); | |||
| RESET_A_POINTER(); | |||
| UPDATE_C_POINTER(1); | |||
| } | |||
| if (pack_b) | |||
| free(packed_b); | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,474 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2024, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |||
| CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE | |||
| GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | |||
| HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | |||
| LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF | |||
| THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #include <arm_neon.h> | |||
| #include <arm_sve.h> | |||
| #if defined(__ARM_NEON_SVE_BRIDGE) && defined(__has_include) && \ | |||
| __has_include(<arm_neon_sve_bridge.h>) | |||
| #include <arm_neon_sve_bridge.h> | |||
| #else | |||
| #define svdup_neonq_f32(fixed_reg) \ | |||
| ({ \ | |||
| svfloat32_t scalable_reg; \ | |||
| asm("mov %0.q, %q1" : "=w"(scalable_reg) : "w"(fixed_reg) :); \ | |||
| scalable_reg; \ | |||
| }) | |||
| #define svdup_neonq_f64(fixed_reg) \ | |||
| ({ \ | |||
| svfloat64_t scalable_reg; \ | |||
| asm("mov %0.q, %q1" : "=w"(scalable_reg) : "w"(fixed_reg) :); \ | |||
| scalable_reg; \ | |||
| }) | |||
| #endif | |||
| #define RESET_A_POINTER() a_offset = A; | |||
| #define CREATE_A_POINTER(m, scale) FLOAT* a_offset##m = a_offset + scale; | |||
| #define UPDATE_A_POINTER(scale) a_offset = a_offset + scale; | |||
| #define A_ELEMENT_K(m, offset_k) *(a_offset##m + (k + offset_k) * lda) | |||
| #define A_ELEMENT(m) A_ELEMENT_K(m, 0) | |||
| #define RESET_B_POINTER() b_offset = B; | |||
| #define CREATE_B_POINTER(n, scale) FLOAT* b_offset##n = b_offset + scale; | |||
| #define UPDATE_B_POINTER(scale) b_offset = b_offset + scale; | |||
| #define B_ELEMENT_K(n, offset_k) *(b_offset##n + (k + offset_k) * ldb) | |||
| #define B_ELEMENT(n) B_ELEMENT_K(n, 0) | |||
| #define CREATE_C_POINTER(n, scale) FLOAT* c_offset##n = c_offset + scale * ldc; | |||
| #define INCR_C_POINTER(m, incr) // c_offset ## m += incr; | |||
| #define UPDATE_C_POINTER(scale) c_offset = c_offset + scale * ldc; | |||
| #define C_ELEMENT(m, n) *(c_offset##n + ((m * v_size) + i)) | |||
| // #undef C_ELEMENT | |||
| // #define C_ELEMENT(m, n) C[(i+(m))+(j+(n))*ldc] | |||
| #define PACK_ELEMENT_K(n, offset_k) packed_b[(k + offset_k) * 4 + n] | |||
| #define PACK_ELEMENT(n) PACK_ELEMENT_K(n, 0) | |||
| // ASIMD | |||
| #define DECLARE_RESULT_VECTOR2(m, n) \ | |||
| float64x2_t result##m##n = vdupq_n_f64(0.0); | |||
| #define DECLARE_RESULT(m, n) float64_t result##m##n = 0.0; | |||
| #define BROADCAST_LOAD_A2(m, offset_k) \ | |||
| float64x2_t a##m##_k##offset_k = vld1q_dup_f64(&A_ELEMENT_K(m, offset_k)); | |||
| #define LOAD_A1(m, offset_k) \ | |||
| float64_t a##m##_k##offset_k = A_ELEMENT_K(m, offset_k); | |||
| #define VECTOR_LOAD_B2(n, offset_k) \ | |||
| float64x2_t b##n##_k##offset_k = vld1q_f64(&B_ELEMENT_K(n, offset_k)); | |||
| #define GATHER_LOAD_B2(n, offset_k) \ | |||
| float64x2_t b##n##_k##offset_k = vdupq_n_f64(B_ELEMENT_K(n, offset_k)); \ | |||
| b##n##_k##offset_k = \ | |||
| vsetq_lane_f64(B_ELEMENT_K(n + 1, offset_k), b##n##_k##offset_k, 1); | |||
| #define UPDATE_RESULT_VECTOR2(m, n, offset_k) \ | |||
| result##m##n = \ | |||
| vfmaq_f64(result##m##n, a##m##_k##offset_k, b##n##_k##offset_k); | |||
| #define UPDATE_RESULT(m, n, offset_k) \ | |||
| result##m##n = result##m##n + a##m##_k##offset_k * b##n##_k##offset_k; | |||
| #ifdef B0 | |||
| #define SCATTER_STORE2(m, n) \ | |||
| result##m##n = vmulq_f64(result##m##n, vdupq_n_f64(alpha)); \ | |||
| C_ELEMENT(m, n + 0) = vgetq_lane_f64(result##m##n, 0); \ | |||
| C_ELEMENT(m, n + 1) = vgetq_lane_f64(result##m##n, 1); | |||
| #else | |||
| #define SCATTER_STORE2(m, n) \ | |||
| result##m##n = vmulq_f64(result##m##n, vdupq_n_f64(alpha)); \ | |||
| C_ELEMENT(m, n + 0) = \ | |||
| C_ELEMENT(m, n + 0) * beta + vgetq_lane_f64(result##m##n, 0); \ | |||
| C_ELEMENT(m, n + 1) = \ | |||
| C_ELEMENT(m, n + 1) * beta + vgetq_lane_f64(result##m##n, 1); | |||
| #endif | |||
| // SVE | |||
| #define DECLARE_RESULT_VECTOR(m, n) svfloat64_t result##m##n = svdup_f64(0.0); | |||
| #define BROADCAST_LOAD_A(m, offset_k) \ | |||
| svfloat64_t a##s##m##_k##offset_k = svdup_f64(A_ELEMENT_K(m, offset_k)); | |||
| #define BROADCAST_LOAD_B(n, offset_k) \ | |||
| svfloat64_t b##s##n##_k##offset_k = svdup_f64(B_ELEMENT_K(n, offset_k)); | |||
| #define VECTOR_LOAD_A(pg, m, offset_k) \ | |||
| svfloat64_t a##s##m##_k##offset_k = svld1(pg, &A_ELEMENT_K(m, offset_k)); | |||
| #define QUADWORD_LOAD_B(n, offset_k) \ | |||
| svfloat64_t b##s##n##_k##offset_k = \ | |||
| svld1rq(pg_true, &B_ELEMENT_K(n, offset_k)); | |||
| #define UPDATE_RESULT_VECTOR(pg, m, n, offset_k) \ | |||
| result##m##n = \ | |||
| svmla_m(pg, result##m##n, a##s##m##_k##offset_k, b##s##n##_k##offset_k); | |||
| #define UPDATE_RESULT_VECTOR_QUADWORD(m, n, outer, lane, offset_k) \ | |||
| result##m##n = svmla_lane( \ | |||
| result##m##n, a##s##m##_k##offset_k, b##s##outer##_k##offset_k, lane); | |||
| #ifdef B0 | |||
| #define VECTOR_STORE(pg, m, n) \ | |||
| result##m##n = svmul_m(pg, result##m##n, alpha_vec); \ | |||
| svst1(pg, &C_ELEMENT(m, n), result##m##n); | |||
| #define SCATTER_STORE(pg, m, n) \ | |||
| result##m##n = svmul_m(pg, result##m##n, alpha_vec); \ | |||
| svst1_scatter_index(pg, &C_ELEMENT(m, n), ldc_vec, result##m##n); | |||
| #else | |||
| #define VECTOR_STORE(pg, m, n) \ | |||
| result##m##n = svmul_m(pg, result##m##n, alpha_vec); \ | |||
| result##m##n = \ | |||
| svmla_m(pg, result##m##n, svld1(pg, &C_ELEMENT(m, n)), beta_vec); \ | |||
| svst1(pg, &C_ELEMENT(m, n), result##m##n); | |||
| #define SCATTER_STORE(pg, m, n) \ | |||
| result##m##n = svmul_m(pg, result##m##n, alpha_vec); \ | |||
| result##m##n = svmla_m(pg, \ | |||
| result##m##n, \ | |||
| svld1_gather_index(pg, &C_ELEMENT(m, n), ldc_vec), \ | |||
| beta_vec); \ | |||
| svst1_scatter_index(pg, &C_ELEMENT(m, n), ldc_vec, result##m##n); | |||
| #endif | |||
| #ifndef LIKELY | |||
| #ifdef __GNUC__ | |||
| #define LIKELY(x) __builtin_expect(!!(x), 1) | |||
| #else | |||
| #define LIKELY(x) (x) | |||
| #endif | |||
| #endif | |||
| #ifdef B0 | |||
| int | |||
| CNAME(BLASLONG M, | |||
| BLASLONG N, | |||
| BLASLONG K, | |||
| IFLOAT* A, | |||
| BLASLONG lda, | |||
| FLOAT alpha, | |||
| IFLOAT* B, | |||
| BLASLONG ldb, | |||
| FLOAT* C, | |||
| BLASLONG ldc) | |||
| #else | |||
| int | |||
| CNAME(BLASLONG M, | |||
| BLASLONG N, | |||
| BLASLONG K, | |||
| IFLOAT* A, | |||
| BLASLONG lda, | |||
| FLOAT alpha, | |||
| IFLOAT* B, | |||
| BLASLONG ldb, | |||
| FLOAT beta, | |||
| FLOAT* C, | |||
| BLASLONG ldc) | |||
| #endif | |||
| { | |||
| const uint64_t v_size = svcntd(); | |||
| const uint64_t v_size2 = v_size * 2; | |||
| const svbool_t pg_true = svptrue_b64(); | |||
| const svbool_t pg_quad = svwhilelt_b64(0, 2); | |||
| const svfloat64_t alpha_vec = svdup_f64(alpha); | |||
| #ifndef B0 | |||
| const svfloat64_t beta_vec = svdup_f64(beta); | |||
| #endif | |||
| const BLASLONG n4 = N & -4; | |||
| const BLASLONG n2 = N & -2; | |||
| const BLASLONG v_m2 = M & -v_size2; | |||
| const BLASLONG v_m1 = M & -v_size; | |||
| FLOAT* b_offset = B; | |||
| FLOAT* a_offset = A; | |||
| FLOAT* c_offset = C; | |||
| BLASLONG j = 0; | |||
| for (; j < n4; j += 4) { | |||
| CREATE_C_POINTER(0, 0); | |||
| CREATE_C_POINTER(1, 1); | |||
| CREATE_C_POINTER(2, 2); | |||
| CREATE_C_POINTER(3, 3); | |||
| CREATE_B_POINTER(0, 0); | |||
| CREATE_B_POINTER(1, 1); | |||
| CREATE_B_POINTER(2, 2); | |||
| CREATE_B_POINTER(3, 3); | |||
| BLASLONG i = 0; | |||
| for (; i < v_m2; i += v_size2) { | |||
| CREATE_A_POINTER(0, 0); | |||
| CREATE_A_POINTER(1, v_size); | |||
| UPDATE_A_POINTER(v_size2); | |||
| BLASLONG k = 0; | |||
| DECLARE_RESULT_VECTOR(0, 0); | |||
| DECLARE_RESULT_VECTOR(0, 1); | |||
| DECLARE_RESULT_VECTOR(0, 2); | |||
| DECLARE_RESULT_VECTOR(0, 3); | |||
| DECLARE_RESULT_VECTOR(1, 0); | |||
| DECLARE_RESULT_VECTOR(1, 1); | |||
| DECLARE_RESULT_VECTOR(1, 2); | |||
| DECLARE_RESULT_VECTOR(1, 3); | |||
| for (; k < K; k++) { | |||
| QUADWORD_LOAD_B(0, 0); | |||
| VECTOR_LOAD_A(pg_true, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); | |||
| QUADWORD_LOAD_B(2, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 2, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 2, 1, 0); | |||
| VECTOR_LOAD_A(pg_true, 1, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 2, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 2, 1, 0); | |||
| } | |||
| VECTOR_STORE(pg_true, 0, 0); | |||
| VECTOR_STORE(pg_true, 0, 1); | |||
| VECTOR_STORE(pg_true, 0, 2); | |||
| VECTOR_STORE(pg_true, 0, 3); | |||
| VECTOR_STORE(pg_true, 1, 0); | |||
| VECTOR_STORE(pg_true, 1, 1); | |||
| VECTOR_STORE(pg_true, 1, 2); | |||
| VECTOR_STORE(pg_true, 1, 3); | |||
| INCR_C_POINTER(0, v_size2); | |||
| INCR_C_POINTER(1, v_size2); | |||
| INCR_C_POINTER(2, v_size2); | |||
| INCR_C_POINTER(3, v_size2); | |||
| } | |||
| for (; i < v_m1; i += v_size) { | |||
| CREATE_A_POINTER(0, 0); | |||
| UPDATE_A_POINTER(v_size); | |||
| BLASLONG k = 0; | |||
| DECLARE_RESULT_VECTOR(0, 0); | |||
| DECLARE_RESULT_VECTOR(0, 1); | |||
| DECLARE_RESULT_VECTOR(0, 2); | |||
| DECLARE_RESULT_VECTOR(0, 3); | |||
| for (; k < K; k++) { | |||
| QUADWORD_LOAD_B(0, 0); | |||
| VECTOR_LOAD_A(pg_true, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); | |||
| QUADWORD_LOAD_B(2, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 2, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 2, 1, 0); | |||
| } | |||
| VECTOR_STORE(pg_true, 0, 0); | |||
| VECTOR_STORE(pg_true, 0, 1); | |||
| VECTOR_STORE(pg_true, 0, 2); | |||
| VECTOR_STORE(pg_true, 0, 3); | |||
| INCR_C_POINTER(0, v_size); | |||
| INCR_C_POINTER(1, v_size); | |||
| INCR_C_POINTER(2, v_size); | |||
| INCR_C_POINTER(3, v_size); | |||
| } | |||
| for (; i < M; i += v_size) { | |||
| const svbool_t pg_tail = svwhilelt_b64((uint64_t)i, (uint64_t)(M)); | |||
| CREATE_A_POINTER(0, 0); | |||
| UPDATE_A_POINTER(0); | |||
| BLASLONG k = 0; | |||
| DECLARE_RESULT_VECTOR(0, 0); | |||
| DECLARE_RESULT_VECTOR(0, 1); | |||
| DECLARE_RESULT_VECTOR(0, 2); | |||
| DECLARE_RESULT_VECTOR(0, 3); | |||
| for (; k < K; k++) { | |||
| QUADWORD_LOAD_B(0, 0); | |||
| VECTOR_LOAD_A(pg_tail, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); | |||
| QUADWORD_LOAD_B(2, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 2, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 2, 1, 0); | |||
| } | |||
| VECTOR_STORE(pg_tail, 0, 0); | |||
| VECTOR_STORE(pg_tail, 0, 1); | |||
| VECTOR_STORE(pg_tail, 0, 2); | |||
| VECTOR_STORE(pg_tail, 0, 3); | |||
| INCR_C_POINTER(0, 0); | |||
| INCR_C_POINTER(1, 0); | |||
| INCR_C_POINTER(2, 0); | |||
| INCR_C_POINTER(3, 0); | |||
| } | |||
| UPDATE_B_POINTER(4); | |||
| RESET_A_POINTER(); | |||
| UPDATE_C_POINTER(4); | |||
| } | |||
| for (; j < n2; j += 2) { | |||
| CREATE_C_POINTER(0, 0); | |||
| CREATE_C_POINTER(1, 1); | |||
| CREATE_B_POINTER(0, 0); | |||
| CREATE_B_POINTER(1, 1); | |||
| BLASLONG i = 0; | |||
| for (; i < v_m2; i += v_size2) { | |||
| CREATE_A_POINTER(0, 0); | |||
| CREATE_A_POINTER(1, v_size); | |||
| UPDATE_A_POINTER(v_size2); | |||
| BLASLONG k = 0; | |||
| DECLARE_RESULT_VECTOR(0, 0); | |||
| DECLARE_RESULT_VECTOR(0, 1); | |||
| DECLARE_RESULT_VECTOR(1, 0); | |||
| DECLARE_RESULT_VECTOR(1, 1); | |||
| for (; k < K; k++) { | |||
| QUADWORD_LOAD_B(0, 0); | |||
| VECTOR_LOAD_A(pg_true, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); | |||
| VECTOR_LOAD_A(pg_true, 1, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 0); | |||
| } | |||
| VECTOR_STORE(pg_true, 0, 0); | |||
| VECTOR_STORE(pg_true, 0, 1); | |||
| VECTOR_STORE(pg_true, 1, 0); | |||
| VECTOR_STORE(pg_true, 1, 1); | |||
| INCR_C_POINTER(0, v_size2); | |||
| INCR_C_POINTER(1, v_size2); | |||
| } | |||
| for (; i < v_m1; i += v_size) { | |||
| CREATE_A_POINTER(0, 0); | |||
| UPDATE_A_POINTER(v_size); | |||
| BLASLONG k = 0; | |||
| DECLARE_RESULT_VECTOR(0, 0); | |||
| DECLARE_RESULT_VECTOR(0, 1); | |||
| for (; k < K; k++) { | |||
| QUADWORD_LOAD_B(0, 0); | |||
| VECTOR_LOAD_A(pg_true, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); | |||
| } | |||
| VECTOR_STORE(pg_true, 0, 0); | |||
| VECTOR_STORE(pg_true, 0, 1); | |||
| INCR_C_POINTER(0, v_size); | |||
| INCR_C_POINTER(1, v_size); | |||
| } | |||
| for (; i < M; i += v_size) { | |||
| const svbool_t pg_tail = svwhilelt_b64((uint64_t)i, (uint64_t)(M)); | |||
| CREATE_A_POINTER(0, 0); | |||
| UPDATE_A_POINTER(0); | |||
| BLASLONG k = 0; | |||
| DECLARE_RESULT_VECTOR(0, 0); | |||
| DECLARE_RESULT_VECTOR(0, 1); | |||
| for (; k < K; k++) { | |||
| QUADWORD_LOAD_B(0, 0); | |||
| VECTOR_LOAD_A(pg_tail, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); | |||
| } | |||
| VECTOR_STORE(pg_tail, 0, 0); | |||
| VECTOR_STORE(pg_tail, 0, 1); | |||
| INCR_C_POINTER(0, 0); | |||
| INCR_C_POINTER(1, 0); | |||
| } | |||
| UPDATE_B_POINTER(2); | |||
| RESET_A_POINTER(); | |||
| UPDATE_C_POINTER(2); | |||
| } | |||
| for (; j < N; j++) { | |||
| CREATE_C_POINTER(0, 0); | |||
| CREATE_B_POINTER(0, 0); | |||
| BLASLONG i = 0; | |||
| for (; i < v_m2; i += v_size2) { | |||
| CREATE_A_POINTER(0, 0); | |||
| CREATE_A_POINTER(1, v_size); | |||
| UPDATE_A_POINTER(v_size2); | |||
| BLASLONG k = 0; | |||
| DECLARE_RESULT_VECTOR(0, 0); | |||
| DECLARE_RESULT_VECTOR(1, 0); | |||
| for (; k < K; k++) { | |||
| BROADCAST_LOAD_B(0, 0); | |||
| VECTOR_LOAD_A(pg_true, 0, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); | |||
| VECTOR_LOAD_A(pg_true, 1, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 0, 0); | |||
| } | |||
| VECTOR_STORE(pg_true, 0, 0); | |||
| VECTOR_STORE(pg_true, 1, 0); | |||
| INCR_C_POINTER(0, v_size2); | |||
| } | |||
| for (; i < v_m1; i += v_size) { | |||
| CREATE_A_POINTER(0, 0); | |||
| UPDATE_A_POINTER(v_size); | |||
| BLASLONG k = 0; | |||
| DECLARE_RESULT_VECTOR(0, 0); | |||
| for (; k < K; k++) { | |||
| BROADCAST_LOAD_B(0, 0); | |||
| VECTOR_LOAD_A(pg_true, 0, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); | |||
| } | |||
| VECTOR_STORE(pg_true, 0, 0); | |||
| INCR_C_POINTER(0, v_size); | |||
| } | |||
| for (; i < M; i += v_size) { | |||
| const svbool_t pg_tail = svwhilelt_b64((uint64_t)i, (uint64_t)(M)); | |||
| CREATE_A_POINTER(0, 0); | |||
| UPDATE_A_POINTER(0); | |||
| BLASLONG k = 0; | |||
| DECLARE_RESULT_VECTOR(0, 0); | |||
| for (; k < K; k++) { | |||
| BROADCAST_LOAD_B(0, 0); | |||
| VECTOR_LOAD_A(pg_tail, 0, 0); | |||
| UPDATE_RESULT_VECTOR(pg_tail, 0, 0, 0); | |||
| } | |||
| VECTOR_STORE(pg_tail, 0, 0); | |||
| INCR_C_POINTER(0, 0); | |||
| } | |||
| UPDATE_B_POINTER(1); | |||
| RESET_A_POINTER(); | |||
| UPDATE_C_POINTER(1); | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,571 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2024, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |||
| CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE | |||
| GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | |||
| HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | |||
| LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF | |||
| THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #include <arm_neon.h> | |||
| #include <arm_sve.h> | |||
| #if defined(__ARM_NEON_SVE_BRIDGE) && defined(__has_include) && \ | |||
| __has_include(<arm_neon_sve_bridge.h>) | |||
| #include <arm_neon_sve_bridge.h> | |||
| #else | |||
| #define svdup_neonq_f32(fixed_reg) \ | |||
| ({ \ | |||
| svfloat32_t scalable_reg; \ | |||
| asm("mov %0.q, %q1" : "=w"(scalable_reg) : "w"(fixed_reg) :); \ | |||
| scalable_reg; \ | |||
| }) | |||
| #define svdup_neonq_f64(fixed_reg) \ | |||
| ({ \ | |||
| svfloat64_t scalable_reg; \ | |||
| asm("mov %0.q, %q1" : "=w"(scalable_reg) : "w"(fixed_reg) :); \ | |||
| scalable_reg; \ | |||
| }) | |||
| #endif | |||
| #define RESET_A_POINTER() a_offset = A; | |||
| #define CREATE_A_POINTER(m, scale) FLOAT* a_offset##m = a_offset + scale * lda; | |||
| #define UPDATE_A_POINTER(scale) a_offset = a_offset + scale * lda; | |||
| #define A_ELEMENT_K(m, offset_k) *(a_offset##m + (k + offset_k)) | |||
| #define A_ELEMENT(m) A_ELEMENT_K(m, 0) | |||
| #define RESET_B_POINTER() b_offset = B; | |||
| #define CREATE_B_POINTER(n, scale) FLOAT* b_offset##n = b_offset + scale * ldb; | |||
| #define UPDATE_B_POINTER(scale) b_offset = b_offset + scale * ldb; | |||
| #define B_ELEMENT_K(n, offset_k) *(b_offset##n + (k + offset_k)) | |||
| #define B_ELEMENT(n) B_ELEMENT_K(n, 0) | |||
| #define CREATE_C_POINTER(m, scale) FLOAT* c_offset##m = c_offset + scale; | |||
| #define INCR_C_POINTER(m, incr) // c_offset ## m += incr * ldc; | |||
| #define UPDATE_C_POINTER(scale) c_offset += scale; | |||
| #define C_ELEMENT(m, n) \ | |||
| *(c_offset##m + ((j + n) * ldc)) // C[(i+(m))+(j+(n))*ldc] | |||
| // #undef C_ELEMENT | |||
| // #define C_ELEMENT(m, n) C[(i+(m))+(j+(n))*ldc] | |||
| #define PACK_ELEMENT_K(m, offset_k) packed_a[(k + offset_k) * v_size2 + m] | |||
| #define PACK_ELEMENT(m) PACK_ELEMENT_K(m, 0) | |||
| // ASIMD | |||
| #define DECLARE_RESULT_VECTOR2(m, n) \ | |||
| float64x2_t result##m##n = vdupq_n_f64(0.0); | |||
| #define DECLARE_RESULT(m, n) float64_t result##m##n = 0.0; | |||
| #define BROADCAST_LOAD_A2(m, offset_k) \ | |||
| float64x2_t a##m##_k##offset_k = vld1q_dup_f64(&A_ELEMENT_K(m, offset_k)); | |||
| #define LOAD_A1(m, offset_k) \ | |||
| float64_t a##m##_k##offset_k = A_ELEMENT_K(m, offset_k); | |||
| #define GATHER_LOAD_B2(n, offset_k) \ | |||
| float64x2_t b##n##_k##offset_k = vdupq_n_f64(B_ELEMENT_K(n, offset_k)); \ | |||
| b##n##_k##offset_k = \ | |||
| vsetq_lane_f64(B_ELEMENT_K(n + 1, offset_k), b##n##_k##offset_k, 1); | |||
| #define VECTOR_UNPACK_B2(n, offset_k) \ | |||
| float64x2_t b##n##_k##offset_k = vld1q_f64(&PACK_ELEMENT_K(n, offset_k)); | |||
| #define PACK_B0(n, offset_k) \ | |||
| PACK_ELEMENT_K(n, offset_k) = vget_lane_f64(b##n##_k##offset_k, 0); | |||
| #define UPDATE_RESULT_VECTOR2(m, n, offset_k) \ | |||
| result##m##n = \ | |||
| vfmaq_f64(result##m##n, a##m##_k##offset_k, b##n##_k##offset_k); | |||
| #define UPDATE_RESULT(m, n, offset_k) \ | |||
| result##m##n = result##m##n + a##m##_k##offset_k * b##n##_k##offset_k; | |||
| #ifdef B0 | |||
| #define SCATTER_STORE2(m, n) \ | |||
| result##m##n = vmulq_f64(result##m##n, vdupq_n_f64(alpha)); \ | |||
| C_ELEMENT(m, n + 0) = vgetq_lane_f64(result##m##n, 0); \ | |||
| C_ELEMENT(m, n + 1) = vgetq_lane_f64(result##m##n, 1); | |||
| #else | |||
| #define SCATTER_STORE2(m, n) \ | |||
| result##m##n = vmulq_f64(result##m##n, vdupq_n_f64(alpha)); \ | |||
| C_ELEMENT(m, n + 0) = \ | |||
| C_ELEMENT(m, n + 0) * beta + vgetq_lane_f64(result##m##n, 0); \ | |||
| C_ELEMENT(m, n + 1) = \ | |||
| C_ELEMENT(m, n + 1) * beta + vgetq_lane_f64(result##m##n, 1); | |||
| #endif | |||
| // SVE | |||
| #define DECLARE_RESULT_VECTOR(m, n) svfloat64_t result##m##n = svdup_f64(0.0); | |||
| #define BROADCAST_LOAD_A(m, offset_k) \ | |||
| svfloat64_t a##s##m##_k##offset_k = svdup_f64(A_ELEMENT_K(m, offset_k)); | |||
| #define BROADCAST_LOAD_B(n, offset_k) \ | |||
| svfloat64_t b##s##n##_k##offset_k = svdup_f64(B_ELEMENT_K(n, offset_k)); | |||
| #define VECTOR_LOAD_A(pg, m, offset_k) \ | |||
| svfloat64_t a##s##m##_k##offset_k = svld1(pg, &A_ELEMENT_K(m, offset_k)); | |||
| #define GATHER_LOAD_A(pg, m, offset_k) \ | |||
| svfloat64_t a##s##m##_k##offset_k = \ | |||
| svld1_gather_index(pg, &A_ELEMENT_K(m, offset_k), lda_vec); | |||
| #define PACK_A(m, offset_k) \ | |||
| svst1(pg_first, &PACK_ELEMENT_K(m, offset_k), a##s##m##_k##offset_k); | |||
| #define VECTOR_PACK_A(m, offset_k) \ | |||
| svst1(pg_true, &PACK_ELEMENT_K(m* v_size, offset_k), a##s##m##_k##offset_k); | |||
| #define QUADWORD_PACK_A(m, offset_k) \ | |||
| svst1(pg_quad, &PACK_ELEMENT_K(m, offset_k), a##s##m##_k##offset_k); | |||
| #define UNPACK_VECTOR_A(m, offset_k) \ | |||
| svfloat64_t a##s##m##_k##offset_k = \ | |||
| svld1(pg_true, &PACK_ELEMENT_K(m * v_size, offset_k)); | |||
| #define UNPACK_BROADCAST_A(m, offset_k) \ | |||
| svfloat64_t a##s##m##_k##offset_k = svdup_f64(PACK_ELEMENT_K(m, offset_k)); | |||
| #define UNPACK_QUADWORD_A(m, offset_k) \ | |||
| svfloat64_t a##s##m##_k##offset_k = \ | |||
| svld1rq(pg_true, &PACK_ELEMENT_K(m, offset_k)); | |||
| #define UPDATE_RESULT_VECTOR(pg, m, n, offset_k) \ | |||
| result##m##n = \ | |||
| svmla_m(pg, result##m##n, a##s##m##_k##offset_k, b##s##n##_k##offset_k); | |||
| #define UPDATE_RESULT_VECTOR_QUADWORD(m, n, outer, lane, offset_k) \ | |||
| result##m##n = svmla_lane( \ | |||
| result##m##n, a##s##m##_k##offset_k, b##s##outer##_k##offset_k, lane); | |||
| #ifdef B0 | |||
| #define VECTOR_STORE(pg, m, n) \ | |||
| result##m##n = svmul_m(pg, result##m##n, alpha_vec); \ | |||
| svst1(pg, &C_ELEMENT(m, n), result##m##n); | |||
| #define SCATTER_STORE(pg, m, n) \ | |||
| result##m##n = svmul_m(pg, result##m##n, alpha_vec); \ | |||
| svst1_scatter_index(pg, &C_ELEMENT(m, n), ldc_vec, result##m##n); | |||
| #else | |||
| #define VECTOR_STORE(pg, m, n) \ | |||
| result##m##n = svmul_m(pg, result##m##n, alpha_vec); \ | |||
| result##m##n = \ | |||
| svmla_m(pg, result##m##n, svld1(pg, &C_ELEMENT(m, n)), beta_vec); \ | |||
| svst1(pg, &C_ELEMENT(m, n), result##m##n); | |||
| #define SCATTER_STORE(pg, m, n) \ | |||
| result##m##n = svmul_m(pg, result##m##n, alpha_vec); \ | |||
| result##m##n = svmla_m(pg, \ | |||
| result##m##n, \ | |||
| svld1_gather_index(pg, &C_ELEMENT(m, n), ldc_vec), \ | |||
| beta_vec); \ | |||
| svst1_scatter_index(pg, &C_ELEMENT(m, n), ldc_vec, result##m##n); | |||
| #endif | |||
| #ifndef LIKELY | |||
| #ifdef __GNUC__ | |||
| #define LIKELY(x) __builtin_expect(!!(x), 1) | |||
| #else | |||
| #define LIKELY(x) (x) | |||
| #endif | |||
| #endif | |||
| #ifdef B0 | |||
| int | |||
| CNAME(BLASLONG M, | |||
| BLASLONG N, | |||
| BLASLONG K, | |||
| IFLOAT* A, | |||
| BLASLONG lda, | |||
| FLOAT alpha, | |||
| IFLOAT* B, | |||
| BLASLONG ldb, | |||
| FLOAT* C, | |||
| BLASLONG ldc) | |||
| #else | |||
| int | |||
| CNAME(BLASLONG M, | |||
| BLASLONG N, | |||
| BLASLONG K, | |||
| IFLOAT* A, | |||
| BLASLONG lda, | |||
| FLOAT alpha, | |||
| IFLOAT* B, | |||
| BLASLONG ldb, | |||
| FLOAT beta, | |||
| FLOAT* C, | |||
| BLASLONG ldc) | |||
| #endif | |||
| { | |||
| const uint64_t v_size = svcntd(); | |||
| const uint64_t v_size2 = v_size * 2; | |||
| const svbool_t pg_true = svptrue_b64(); | |||
| const svbool_t pg_quad = svwhilelt_b64(0, 2); | |||
| const svbool_t pg_first = svwhilelt_b64(0, 1); | |||
| const svfloat64_t alpha_vec = svdup_f64(alpha); | |||
| #ifndef B0 | |||
| const svfloat64_t beta_vec = svdup_f64(beta); | |||
| #endif | |||
| const svuint64_t lda_vec = svindex_u64(0LL, lda); | |||
| const BLASLONG v_m2 = M & -v_size2; | |||
| const BLASLONG v_m1 = M & -v_size; | |||
| const BLASLONG n4 = N & -4; | |||
| const BLASLONG n2 = N & -2; | |||
| const int pack_a = M >= v_size2 && N >= 8 && K >= 8 ? 1 : 0; | |||
| FLOAT* packed_a = | |||
| (pack_a) ? packed_a = (FLOAT*)malloc(K * v_size2 * sizeof(FLOAT)) : NULL; | |||
| FLOAT* a_offset = A; | |||
| FLOAT* b_offset = B; | |||
| FLOAT* c_offset = C; | |||
| BLASLONG i = 0; | |||
| for (; i < v_m2; i += v_size2) { | |||
| CREATE_C_POINTER(0, 0); | |||
| CREATE_C_POINTER(1, v_size); | |||
| CREATE_A_POINTER(0, 0); | |||
| CREATE_A_POINTER(1, v_size); | |||
| BLASLONG j = 0; | |||
| for (; j < n4; j += 4) { | |||
| CREATE_B_POINTER(0, 0); | |||
| CREATE_B_POINTER(1, 1); | |||
| CREATE_B_POINTER(2, 2); | |||
| CREATE_B_POINTER(3, 3); | |||
| UPDATE_B_POINTER(4); | |||
| BLASLONG k = 0; | |||
| DECLARE_RESULT_VECTOR(0, 0); | |||
| DECLARE_RESULT_VECTOR(0, 1); | |||
| DECLARE_RESULT_VECTOR(0, 2); | |||
| DECLARE_RESULT_VECTOR(0, 3); | |||
| DECLARE_RESULT_VECTOR(1, 0); | |||
| DECLARE_RESULT_VECTOR(1, 1); | |||
| DECLARE_RESULT_VECTOR(1, 2); | |||
| DECLARE_RESULT_VECTOR(1, 3); | |||
| if (LIKELY(packed_a != NULL)) { | |||
| if (j == 0) { | |||
| for (; k < K; k++) { | |||
| BROADCAST_LOAD_B(0, 0); | |||
| GATHER_LOAD_A(pg_true, 0, 0); | |||
| VECTOR_PACK_A(0, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); | |||
| BROADCAST_LOAD_B(1, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 1, 0); | |||
| GATHER_LOAD_A(pg_true, 1, 0); | |||
| VECTOR_PACK_A(1, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 0, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 1, 0); | |||
| BROADCAST_LOAD_B(2, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 2, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 2, 0); | |||
| BROADCAST_LOAD_B(3, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 3, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 3, 0); | |||
| } | |||
| } else { | |||
| for (; k < K; k++) { | |||
| BROADCAST_LOAD_B(0, 0); | |||
| UNPACK_VECTOR_A(0, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); | |||
| BROADCAST_LOAD_B(1, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 1, 0); | |||
| UNPACK_VECTOR_A(1, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 0, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 1, 0); | |||
| BROADCAST_LOAD_B(2, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 2, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 2, 0); | |||
| BROADCAST_LOAD_B(3, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 3, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 3, 0); | |||
| } | |||
| } | |||
| } else { | |||
| for (; k < K; k++) { | |||
| BROADCAST_LOAD_B(0, 0); | |||
| GATHER_LOAD_A(pg_true, 0, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); | |||
| BROADCAST_LOAD_B(1, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 1, 0); | |||
| GATHER_LOAD_A(pg_true, 1, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 0, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 1, 0); | |||
| BROADCAST_LOAD_B(2, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 2, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 2, 0); | |||
| BROADCAST_LOAD_B(3, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 3, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 3, 0); | |||
| } | |||
| } | |||
| VECTOR_STORE(pg_true, 0, 0); | |||
| VECTOR_STORE(pg_true, 0, 1); | |||
| VECTOR_STORE(pg_true, 0, 2); | |||
| VECTOR_STORE(pg_true, 0, 3); | |||
| VECTOR_STORE(pg_true, 1, 0); | |||
| VECTOR_STORE(pg_true, 1, 1); | |||
| VECTOR_STORE(pg_true, 1, 2); | |||
| VECTOR_STORE(pg_true, 1, 3); | |||
| INCR_C_POINTER(0, 4); | |||
| INCR_C_POINTER(1, 4); | |||
| } | |||
| for (; j < n2; j += 2) { | |||
| CREATE_B_POINTER(0, 0); | |||
| CREATE_B_POINTER(1, 1); | |||
| UPDATE_B_POINTER(2); | |||
| BLASLONG k = 0; | |||
| DECLARE_RESULT_VECTOR(0, 0); | |||
| DECLARE_RESULT_VECTOR(0, 1); | |||
| DECLARE_RESULT_VECTOR(1, 0); | |||
| DECLARE_RESULT_VECTOR(1, 1); | |||
| if (LIKELY(packed_a != NULL)) { | |||
| for (; k < K; k++) { | |||
| BROADCAST_LOAD_B(0, 0); | |||
| UNPACK_VECTOR_A(0, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); | |||
| BROADCAST_LOAD_B(1, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 1, 0); | |||
| UNPACK_VECTOR_A(1, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 0, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 1, 0); | |||
| } | |||
| } else { | |||
| for (; k < K; k++) { | |||
| BROADCAST_LOAD_B(0, 0); | |||
| GATHER_LOAD_A(pg_true, 0, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); | |||
| BROADCAST_LOAD_B(1, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 1, 0); | |||
| GATHER_LOAD_A(pg_true, 1, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 0, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 1, 0); | |||
| } | |||
| } | |||
| VECTOR_STORE(pg_true, 0, 0); | |||
| VECTOR_STORE(pg_true, 0, 1); | |||
| VECTOR_STORE(pg_true, 1, 0); | |||
| VECTOR_STORE(pg_true, 1, 1); | |||
| INCR_C_POINTER(0, 2); | |||
| INCR_C_POINTER(1, 2); | |||
| } | |||
| for (; j < N; j++) { | |||
| CREATE_B_POINTER(0, 0); | |||
| UPDATE_B_POINTER(1); | |||
| BLASLONG k = 0; | |||
| DECLARE_RESULT_VECTOR(0, 0); | |||
| DECLARE_RESULT_VECTOR(1, 0); | |||
| if (LIKELY(packed_a != NULL)) { | |||
| for (; k < K; k++) { | |||
| BROADCAST_LOAD_B(0, 0); | |||
| UNPACK_VECTOR_A(0, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); | |||
| UNPACK_VECTOR_A(1, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 0, 0); | |||
| } | |||
| } else { | |||
| for (; k < K; k++) { | |||
| BROADCAST_LOAD_B(0, 0); | |||
| GATHER_LOAD_A(pg_true, 0, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); | |||
| GATHER_LOAD_A(pg_true, 1, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 0, 0); | |||
| } | |||
| } | |||
| VECTOR_STORE(pg_true, 0, 0); | |||
| VECTOR_STORE(pg_true, 1, 0); | |||
| INCR_C_POINTER(0, 1); | |||
| INCR_C_POINTER(1, 1); | |||
| } | |||
| UPDATE_A_POINTER(v_size2); | |||
| RESET_B_POINTER(); | |||
| UPDATE_C_POINTER(v_size2); | |||
| } | |||
| for (; i < v_m1; i += v_size) { | |||
| CREATE_C_POINTER(0, 0); | |||
| CREATE_A_POINTER(0, 0); | |||
| BLASLONG j = 0; | |||
| for (; j < n4; j += 4) { | |||
| CREATE_B_POINTER(0, 0); | |||
| CREATE_B_POINTER(1, 1); | |||
| CREATE_B_POINTER(2, 2); | |||
| CREATE_B_POINTER(3, 3); | |||
| UPDATE_B_POINTER(4); | |||
| BLASLONG k = 0; | |||
| DECLARE_RESULT_VECTOR(0, 0); | |||
| DECLARE_RESULT_VECTOR(0, 1); | |||
| DECLARE_RESULT_VECTOR(0, 2); | |||
| DECLARE_RESULT_VECTOR(0, 3); | |||
| for (; k < K; k++) { | |||
| BROADCAST_LOAD_B(0, 0); | |||
| GATHER_LOAD_A(pg_true, 0, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); | |||
| BROADCAST_LOAD_B(1, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 1, 0); | |||
| BROADCAST_LOAD_B(2, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 2, 0); | |||
| BROADCAST_LOAD_B(3, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 3, 0); | |||
| } | |||
| VECTOR_STORE(pg_true, 0, 0); | |||
| VECTOR_STORE(pg_true, 0, 1); | |||
| VECTOR_STORE(pg_true, 0, 2); | |||
| VECTOR_STORE(pg_true, 0, 3); | |||
| INCR_C_POINTER(0, 4); | |||
| } | |||
| for (; j < n2; j += 2) { | |||
| CREATE_B_POINTER(0, 0); | |||
| CREATE_B_POINTER(1, 1); | |||
| UPDATE_B_POINTER(2); | |||
| BLASLONG k = 0; | |||
| DECLARE_RESULT_VECTOR(0, 0); | |||
| DECLARE_RESULT_VECTOR(0, 1); | |||
| for (; k < K; k++) { | |||
| BROADCAST_LOAD_B(0, 0); | |||
| GATHER_LOAD_A(pg_true, 0, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); | |||
| BROADCAST_LOAD_B(1, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 1, 0); | |||
| } | |||
| VECTOR_STORE(pg_true, 0, 0); | |||
| VECTOR_STORE(pg_true, 0, 1); | |||
| INCR_C_POINTER(0, 2); | |||
| } | |||
| for (; j < N; j++) { | |||
| CREATE_B_POINTER(0, 0); | |||
| UPDATE_B_POINTER(1); | |||
| BLASLONG k = 0; | |||
| DECLARE_RESULT_VECTOR(0, 0); | |||
| for (; k < K; k++) { | |||
| BROADCAST_LOAD_B(0, 0); | |||
| GATHER_LOAD_A(pg_true, 0, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); | |||
| } | |||
| VECTOR_STORE(pg_true, 0, 0); | |||
| INCR_C_POINTER(0, 1); | |||
| } | |||
| UPDATE_A_POINTER(v_size); | |||
| RESET_B_POINTER(); | |||
| UPDATE_C_POINTER(v_size); | |||
| } | |||
| for (; i < M; i += v_size) { | |||
| const svbool_t pg_tail = svwhilelt_b64((uint64_t)i, (uint64_t)(M)); | |||
| CREATE_C_POINTER(0, 0); | |||
| CREATE_A_POINTER(0, 0); | |||
| BLASLONG j = 0; | |||
| for (; j < n4; j += 4) { | |||
| CREATE_B_POINTER(0, 0); | |||
| CREATE_B_POINTER(1, 1); | |||
| CREATE_B_POINTER(2, 2); | |||
| CREATE_B_POINTER(3, 3); | |||
| UPDATE_B_POINTER(4); | |||
| BLASLONG k = 0; | |||
| DECLARE_RESULT_VECTOR(0, 0); | |||
| DECLARE_RESULT_VECTOR(0, 1); | |||
| DECLARE_RESULT_VECTOR(0, 2); | |||
| DECLARE_RESULT_VECTOR(0, 3); | |||
| for (; k < K; k++) { | |||
| BROADCAST_LOAD_B(0, 0); | |||
| GATHER_LOAD_A(pg_tail, 0, 0); | |||
| UPDATE_RESULT_VECTOR(pg_tail, 0, 0, 0); | |||
| BROADCAST_LOAD_B(1, 0); | |||
| UPDATE_RESULT_VECTOR(pg_tail, 0, 1, 0); | |||
| BROADCAST_LOAD_B(2, 0); | |||
| UPDATE_RESULT_VECTOR(pg_tail, 0, 2, 0); | |||
| BROADCAST_LOAD_B(3, 0); | |||
| UPDATE_RESULT_VECTOR(pg_tail, 0, 3, 0); | |||
| } | |||
| VECTOR_STORE(pg_tail, 0, 0); | |||
| VECTOR_STORE(pg_tail, 0, 1); | |||
| VECTOR_STORE(pg_tail, 0, 2); | |||
| VECTOR_STORE(pg_tail, 0, 3); | |||
| INCR_C_POINTER(0, 4); | |||
| } | |||
| for (; j < n2; j += 2) { | |||
| CREATE_B_POINTER(0, 0); | |||
| CREATE_B_POINTER(1, 1); | |||
| UPDATE_B_POINTER(2); | |||
| BLASLONG k = 0; | |||
| DECLARE_RESULT_VECTOR(0, 0); | |||
| DECLARE_RESULT_VECTOR(0, 1); | |||
| for (; k < K; k++) { | |||
| BROADCAST_LOAD_B(0, 0); | |||
| GATHER_LOAD_A(pg_tail, 0, 0); | |||
| UPDATE_RESULT_VECTOR(pg_tail, 0, 0, 0); | |||
| BROADCAST_LOAD_B(1, 0); | |||
| UPDATE_RESULT_VECTOR(pg_tail, 0, 1, 0); | |||
| } | |||
| VECTOR_STORE(pg_tail, 0, 0); | |||
| VECTOR_STORE(pg_tail, 0, 1); | |||
| INCR_C_POINTER(0, 2); | |||
| } | |||
| for (; j < N; j++) { | |||
| CREATE_B_POINTER(0, 0); | |||
| UPDATE_B_POINTER(1); | |||
| BLASLONG k = 0; | |||
| DECLARE_RESULT_VECTOR(0, 0); | |||
| for (; k < K; k++) { | |||
| BROADCAST_LOAD_B(0, 0); | |||
| GATHER_LOAD_A(pg_tail, 0, 0); | |||
| UPDATE_RESULT_VECTOR(pg_tail, 0, 0, 0); | |||
| } | |||
| VECTOR_STORE(pg_tail, 0, 0); | |||
| INCR_C_POINTER(0, 1); | |||
| } | |||
| UPDATE_A_POINTER(0); | |||
| RESET_B_POINTER(); | |||
| UPDATE_C_POINTER(0); | |||
| } | |||
| if (pack_a) | |||
| free(packed_a); | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,564 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2024, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |||
| CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE | |||
| GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | |||
| HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | |||
| LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF | |||
| THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #include <arm_neon.h> | |||
| #include <arm_sve.h> | |||
| #if defined(__ARM_NEON_SVE_BRIDGE) && defined(__has_include) && \ | |||
| __has_include(<arm_neon_sve_bridge.h>) | |||
| #include <arm_neon_sve_bridge.h> | |||
| #else | |||
| #define svdup_neonq_f32(fixed_reg) \ | |||
| ({ \ | |||
| svfloat32_t scalable_reg; \ | |||
| asm("mov %0.q, %q1" : "=w"(scalable_reg) : "w"(fixed_reg) :); \ | |||
| scalable_reg; \ | |||
| }) | |||
| #define svdup_neonq_f64(fixed_reg) \ | |||
| ({ \ | |||
| svfloat64_t scalable_reg; \ | |||
| asm("mov %0.q, %q1" : "=w"(scalable_reg) : "w"(fixed_reg) :); \ | |||
| scalable_reg; \ | |||
| }) | |||
| #endif | |||
| #define RESET_A_POINTER() a_offset = A; | |||
| #define CREATE_A_POINTER(m, scale) FLOAT* a_offset##m = a_offset + scale * lda; | |||
| #define UPDATE_A_POINTER(scale) a_offset = a_offset + scale * lda; | |||
| #define A_ELEMENT_K(m, offset_k) *(a_offset##m + (k + offset_k)) | |||
| #define A_ELEMENT(m) A_ELEMENT_K(m, 0) | |||
| #define RESET_B_POINTER() b_offset = B; | |||
| #define CREATE_B_POINTER(n, scale) FLOAT* b_offset##n = b_offset + scale; | |||
| #define UPDATE_B_POINTER(scale) b_offset = b_offset + scale; | |||
| #define B_ELEMENT_K(n, offset_k) *(b_offset##n + (k + offset_k) * ldb) | |||
| #define B_ELEMENT(n) B_ELEMENT_K(n, 0) | |||
| #define CREATE_C_POINTER(m, scale) FLOAT* c_offset##m = c_offset + scale; | |||
| #define INCR_C_POINTER(m, incr) // c_offset ## m += incr * ldc; | |||
| #define UPDATE_C_POINTER(scale) c_offset += scale; | |||
| #define C_ELEMENT(m, n) \ | |||
| *(c_offset##m + ((j + n) * ldc)) // C[(i+(m))+(j+(n))*ldc] | |||
| // #undef C_ELEMENT | |||
| // #define C_ELEMENT(m, n) C[(i+(m))+(j+(n))*ldc] | |||
| #define PACK_ELEMENT_K(m, offset_k) packed_a[(k + offset_k) * v_size2 + m] | |||
| #define PACK_ELEMENT(m) PACK_ELEMENT_K(m, 0) | |||
| // ASIMD | |||
| #define DECLARE_RESULT_VECTOR2(m, n) \ | |||
| float64x2_t result##m##n = vdupq_n_f64(0.0); | |||
| #define DECLARE_RESULT(m, n) float64_t result##m##n = 0.0; | |||
| #define BROADCAST_LOAD_A2(m, offset_k) \ | |||
| float64x2_t a##m##_k##offset_k = vld1q_dup_f64(&A_ELEMENT_K(m, offset_k)); | |||
| #define LOAD_A1(m, offset_k) \ | |||
| float64_t a##m##_k##offset_k = A_ELEMENT_K(m, offset_k); | |||
| #define VECTOR_LOAD_B2(n, offset_k) \ | |||
| float64x2_t b##n##_k##offset_k = vld1q_f64(&B_ELEMENT_K(n, offset_k)); | |||
| #define GATHER_LOAD_B2(n, offset_k) \ | |||
| float64x2_t b##n##_k##offset_k = vdupq_n_f64(B_ELEMENT_K(n, offset_k)); \ | |||
| b##n##_k##offset_k = \ | |||
| vsetq_lane_f64(B_ELEMENT_K(n + 1, offset_k), b##n##_k##offset_k, 1); | |||
| #define VECTOR_UNPACK_B2(n, offset_k) \ | |||
| float64x2_t b##n##_k##offset_k = vld1q_f64(&PACK_ELEMENT_K(n, offset_k)); | |||
| #define VECTOR_PACK_B2(n, offset_k) \ | |||
| vst1q_f64(&PACK_ELEMENT_K(n, offset_k), b##n##_k##offset_k); | |||
| #define PACK_B0(n, offset_k) \ | |||
| PACK_ELEMENT_K(n, offset_k) = vget_lane_f64(b##n##_k##offset_k, 0); | |||
| #define UPDATE_RESULT_VECTOR2(m, n, offset_k) \ | |||
| result##m##n = \ | |||
| vfmaq_f64(result##m##n, a##m##_k##offset_k, b##n##_k##offset_k); | |||
| #define UPDATE_RESULT(m, n, offset_k) \ | |||
| result##m##n = result##m##n + a##m##_k##offset_k * b##n##_k##offset_k; | |||
| #ifdef B0 | |||
| #define VECTOR_STORE2(m, n) \ | |||
| vst1q_f64(&C_ELEMENT(m, n), vmulq_f64(result##m##n, vdupq_n_f64(alpha))); | |||
| #define STORE(m, n) C_ELEMENT(m, n) = alpha * result##m##n; | |||
| #else | |||
| #define VECTOR_STORE2(m, n) \ | |||
| result##m##n = vmulq_f64(result##m##n, vdupq_n_f64(alpha)); \ | |||
| result##m##n = \ | |||
| vfmaq_f64(result##m##n, vld1q_f64(&C_ELEMENT(m, n)), vdupq_n_f64(beta)); \ | |||
| vst1q_f64(&C_ELEMENT(m, n), result##m##n); | |||
| #define STORE(m, n) \ | |||
| C_ELEMENT(m, n) = C_ELEMENT(m, n) * beta + alpha * result##m##n; | |||
| #endif | |||
| // SVE | |||
| #define DECLARE_RESULT_VECTOR(m, n) svfloat64_t result##m##n = svdup_f64(0.0); | |||
| #define BROADCAST_LOAD_A(m, offset_k) \ | |||
| svfloat64_t a##s##m##_k##offset_k = svdup_f64(A_ELEMENT_K(m, offset_k)); | |||
| #define BROADCAST_LOAD_B(n, offset_k) \ | |||
| svfloat64_t b##s##n##_k##offset_k = svdup_f64(B_ELEMENT_K(n, offset_k)); | |||
| #define VECTOR_LOAD_A(pg, m, offset_k) \ | |||
| svfloat64_t a##s##m##_k##offset_k = svld1(pg, &A_ELEMENT_K(m, offset_k)); | |||
| #define QUADWORD_LOAD_B(n, offset_k) \ | |||
| svfloat64_t b##s##n##_k##offset_k = \ | |||
| svld1rq(pg_true, &B_ELEMENT_K(n, offset_k)); | |||
| #define GATHER_LOAD_A(pg, m, offset_k) \ | |||
| svfloat64_t a##s##m##_k##offset_k = \ | |||
| svld1_gather_index(pg, &A_ELEMENT_K(m, offset_k), lda_vec); | |||
| #define PACK_A(m, offset_k) \ | |||
| svst1(pg_first, &PACK_ELEMENT_K(m, offset_k), a##s##m##_k##offset_k); | |||
| #define VECTOR_PACK_A(m, offset_k) \ | |||
| svst1(pg_true, &PACK_ELEMENT_K(m* v_size, offset_k), a##s##m##_k##offset_k); | |||
| #define QUADWORD_PACK_A(m, offset_k) \ | |||
| svst1(pg_quad, &PACK_ELEMENT_K(m, offset_k), a##s##m##_k##offset_k); | |||
| #define UNPACK_VECTOR_A(m, offset_k) \ | |||
| svfloat64_t a##s##m##_k##offset_k = \ | |||
| svld1(pg_true, &PACK_ELEMENT_K(m * v_size, offset_k)); | |||
| #define UNPACK_BROADCAST_A(m, offset_k) \ | |||
| svfloat64_t a##s##m##_k##offset_k = svdup_f64(PACK_ELEMENT_K(m, offset_k)); | |||
| #define UNPACK_QUADWORD_A(m, offset_k) \ | |||
| svfloat64_t a##s##m##_k##offset_k = \ | |||
| svld1rq(pg_true, &PACK_ELEMENT_K(m, offset_k)); | |||
| #define UPDATE_RESULT_VECTOR(pg, m, n, offset_k) \ | |||
| result##m##n = \ | |||
| svmla_m(pg, result##m##n, a##s##m##_k##offset_k, b##s##n##_k##offset_k); | |||
| #define UPDATE_RESULT_VECTOR_QUADWORD(m, n, outer, lane, offset_k) \ | |||
| result##m##n = svmla_lane( \ | |||
| result##m##n, a##s##m##_k##offset_k, b##s##outer##_k##offset_k, lane); | |||
| #ifdef B0 | |||
| #define VECTOR_STORE(pg, m, n) \ | |||
| result##m##n = svmul_m(pg, result##m##n, alpha_vec); \ | |||
| svst1(pg, &C_ELEMENT(m, n), result##m##n); | |||
| #define SCATTER_STORE(pg, m, n) \ | |||
| result##m##n = svmul_m(pg, result##m##n, alpha_vec); \ | |||
| svst1_scatter_index(pg, &C_ELEMENT(m, n), ldc_vec, result##m##n); | |||
| #else | |||
| #define VECTOR_STORE(pg, m, n) \ | |||
| result##m##n = svmul_m(pg, result##m##n, alpha_vec); \ | |||
| result##m##n = \ | |||
| svmla_m(pg, result##m##n, svld1(pg, &C_ELEMENT(m, n)), beta_vec); \ | |||
| svst1(pg, &C_ELEMENT(m, n), result##m##n); | |||
| #define SCATTER_STORE(pg, m, n) \ | |||
| result##m##n = svmul_m(pg, result##m##n, alpha_vec); \ | |||
| result##m##n = svmla_m(pg, \ | |||
| result##m##n, \ | |||
| svld1_gather_index(pg, &C_ELEMENT(m, n), ldc_vec), \ | |||
| beta_vec); \ | |||
| svst1_scatter_index(pg, &C_ELEMENT(m, n), ldc_vec, result##m##n); | |||
| #endif | |||
| #ifndef LIKELY | |||
| #ifdef __GNUC__ | |||
| #define LIKELY(x) __builtin_expect(!!(x), 1) | |||
| #else | |||
| #define LIKELY(x) (x) | |||
| #endif | |||
| #endif | |||
| #ifdef B0 | |||
| int | |||
| CNAME(BLASLONG M, | |||
| BLASLONG N, | |||
| BLASLONG K, | |||
| IFLOAT* A, | |||
| BLASLONG lda, | |||
| FLOAT alpha, | |||
| IFLOAT* B, | |||
| BLASLONG ldb, | |||
| FLOAT* C, | |||
| BLASLONG ldc) | |||
| #else | |||
| int | |||
| CNAME(BLASLONG M, | |||
| BLASLONG N, | |||
| BLASLONG K, | |||
| IFLOAT* A, | |||
| BLASLONG lda, | |||
| FLOAT alpha, | |||
| IFLOAT* B, | |||
| BLASLONG ldb, | |||
| FLOAT beta, | |||
| FLOAT* C, | |||
| BLASLONG ldc) | |||
| #endif | |||
| { | |||
| const uint64_t v_size = svcntd(); | |||
| const uint64_t v_size2 = v_size * 2; | |||
| const svbool_t pg_true = svptrue_b64(); | |||
| const svbool_t pg_quad = svwhilelt_b64(0, 2); | |||
| const svbool_t pg_first = svwhilelt_b64(0, 1); | |||
| const svfloat64_t alpha_vec = svdup_f64(alpha); | |||
| #ifndef B0 | |||
| const svfloat64_t beta_vec = svdup_f64(beta); | |||
| #endif | |||
| const svuint64_t lda_vec = svindex_u64(0LL, lda); | |||
| const BLASLONG v_m2 = M & -v_size2; | |||
| const BLASLONG v_m1 = M & -v_size; | |||
| const BLASLONG n4 = N & -4; | |||
| const BLASLONG n2 = N & -2; | |||
| const int pack_a = M >= v_size2 && N >= 8 && K >= 8 ? 1 : 0; | |||
| FLOAT* packed_a = | |||
| (pack_a) ? packed_a = (FLOAT*)malloc(K * v_size2 * sizeof(FLOAT)) : NULL; | |||
| FLOAT* a_offset = A; | |||
| FLOAT* b_offset = B; | |||
| FLOAT* c_offset = C; | |||
| BLASLONG i = 0; | |||
| for (; i < v_m2; i += v_size2) { | |||
| CREATE_C_POINTER(0, 0); | |||
| CREATE_C_POINTER(1, v_size); | |||
| CREATE_A_POINTER(0, 0); | |||
| CREATE_A_POINTER(1, v_size); | |||
| BLASLONG j = 0; | |||
| for (; j < n4; j += 4) { | |||
| CREATE_B_POINTER(0, 0); | |||
| CREATE_B_POINTER(1, 1); | |||
| CREATE_B_POINTER(2, 2); | |||
| CREATE_B_POINTER(3, 3); | |||
| UPDATE_B_POINTER(4); | |||
| BLASLONG k = 0; | |||
| DECLARE_RESULT_VECTOR(0, 0); | |||
| DECLARE_RESULT_VECTOR(0, 1); | |||
| DECLARE_RESULT_VECTOR(0, 2); | |||
| DECLARE_RESULT_VECTOR(0, 3); | |||
| DECLARE_RESULT_VECTOR(1, 0); | |||
| DECLARE_RESULT_VECTOR(1, 1); | |||
| DECLARE_RESULT_VECTOR(1, 2); | |||
| DECLARE_RESULT_VECTOR(1, 3); | |||
| if (LIKELY(packed_a != NULL)) { | |||
| if (j == 0) { | |||
| for (; k < K; k++) { | |||
| QUADWORD_LOAD_B(0, 0); | |||
| GATHER_LOAD_A(pg_true, 0, 0); | |||
| VECTOR_PACK_A(0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); | |||
| QUADWORD_LOAD_B(2, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 2, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 2, 1, 0); | |||
| GATHER_LOAD_A(pg_true, 1, 0); | |||
| VECTOR_PACK_A(1, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 2, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 2, 1, 0); | |||
| } | |||
| } else { | |||
| for (; k < K; k++) { | |||
| QUADWORD_LOAD_B(0, 0); | |||
| UNPACK_VECTOR_A(0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); | |||
| QUADWORD_LOAD_B(2, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 2, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 2, 1, 0); | |||
| UNPACK_VECTOR_A(1, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 2, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 2, 1, 0); | |||
| } | |||
| } | |||
| } else { | |||
| for (; k < K; k++) { | |||
| QUADWORD_LOAD_B(0, 0); | |||
| GATHER_LOAD_A(pg_true, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); | |||
| QUADWORD_LOAD_B(2, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 2, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 2, 1, 0); | |||
| GATHER_LOAD_A(pg_true, 1, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 2, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 2, 1, 0); | |||
| } | |||
| } | |||
| VECTOR_STORE(pg_true, 0, 0); | |||
| VECTOR_STORE(pg_true, 0, 1); | |||
| VECTOR_STORE(pg_true, 0, 2); | |||
| VECTOR_STORE(pg_true, 0, 3); | |||
| VECTOR_STORE(pg_true, 1, 0); | |||
| VECTOR_STORE(pg_true, 1, 1); | |||
| VECTOR_STORE(pg_true, 1, 2); | |||
| VECTOR_STORE(pg_true, 1, 3); | |||
| INCR_C_POINTER(0, 4); | |||
| INCR_C_POINTER(1, 4); | |||
| } | |||
| for (; j < n2; j += 2) { | |||
| CREATE_B_POINTER(0, 0); | |||
| CREATE_B_POINTER(1, 1); | |||
| UPDATE_B_POINTER(2); | |||
| BLASLONG k = 0; | |||
| DECLARE_RESULT_VECTOR(0, 0); | |||
| DECLARE_RESULT_VECTOR(0, 1); | |||
| DECLARE_RESULT_VECTOR(1, 0); | |||
| DECLARE_RESULT_VECTOR(1, 1); | |||
| if (LIKELY(packed_a != NULL)) { | |||
| for (; k < K; k++) { | |||
| QUADWORD_LOAD_B(0, 0); | |||
| UNPACK_VECTOR_A(0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); | |||
| UNPACK_VECTOR_A(1, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 0); | |||
| } | |||
| } else { | |||
| for (; k < K; k++) { | |||
| QUADWORD_LOAD_B(0, 0); | |||
| GATHER_LOAD_A(pg_true, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); | |||
| GATHER_LOAD_A(pg_true, 1, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 0); | |||
| } | |||
| } | |||
| VECTOR_STORE(pg_true, 0, 0); | |||
| VECTOR_STORE(pg_true, 0, 1); | |||
| VECTOR_STORE(pg_true, 1, 0); | |||
| VECTOR_STORE(pg_true, 1, 1); | |||
| INCR_C_POINTER(0, 2); | |||
| INCR_C_POINTER(1, 2); | |||
| } | |||
| for (; j < N; j++) { | |||
| CREATE_B_POINTER(0, 0); | |||
| UPDATE_B_POINTER(1); | |||
| BLASLONG k = 0; | |||
| DECLARE_RESULT_VECTOR(0, 0); | |||
| DECLARE_RESULT_VECTOR(1, 0); | |||
| if (LIKELY(packed_a != NULL)) { | |||
| for (; k < K; k++) { | |||
| BROADCAST_LOAD_B(0, 0); | |||
| UNPACK_VECTOR_A(0, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); | |||
| UNPACK_VECTOR_A(1, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 0, 0); | |||
| } | |||
| } else { | |||
| for (; k < K; k++) { | |||
| BROADCAST_LOAD_B(0, 0); | |||
| GATHER_LOAD_A(pg_true, 0, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); | |||
| GATHER_LOAD_A(pg_true, 1, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 0, 0); | |||
| } | |||
| } | |||
| VECTOR_STORE(pg_true, 0, 0); | |||
| VECTOR_STORE(pg_true, 1, 0); | |||
| INCR_C_POINTER(0, 1); | |||
| INCR_C_POINTER(1, 1); | |||
| } | |||
| UPDATE_A_POINTER(v_size2); | |||
| RESET_B_POINTER(); | |||
| UPDATE_C_POINTER(v_size2); | |||
| } | |||
| for (; i < v_m1; i += v_size) { | |||
| CREATE_C_POINTER(0, 0); | |||
| CREATE_A_POINTER(0, 0); | |||
| BLASLONG j = 0; | |||
| for (; j < n4; j += 4) { | |||
| CREATE_B_POINTER(0, 0); | |||
| CREATE_B_POINTER(1, 1); | |||
| CREATE_B_POINTER(2, 2); | |||
| CREATE_B_POINTER(3, 3); | |||
| UPDATE_B_POINTER(4); | |||
| BLASLONG k = 0; | |||
| DECLARE_RESULT_VECTOR(0, 0); | |||
| DECLARE_RESULT_VECTOR(0, 1); | |||
| DECLARE_RESULT_VECTOR(0, 2); | |||
| DECLARE_RESULT_VECTOR(0, 3); | |||
| for (; k < K; k++) { | |||
| QUADWORD_LOAD_B(0, 0); | |||
| GATHER_LOAD_A(pg_true, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); | |||
| QUADWORD_LOAD_B(2, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 2, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 2, 1, 0); | |||
| } | |||
| VECTOR_STORE(pg_true, 0, 0); | |||
| VECTOR_STORE(pg_true, 0, 1); | |||
| VECTOR_STORE(pg_true, 0, 2); | |||
| VECTOR_STORE(pg_true, 0, 3); | |||
| INCR_C_POINTER(0, 4); | |||
| } | |||
| for (; j < n2; j += 2) { | |||
| CREATE_B_POINTER(0, 0); | |||
| CREATE_B_POINTER(1, 1); | |||
| UPDATE_B_POINTER(2); | |||
| BLASLONG k = 0; | |||
| DECLARE_RESULT_VECTOR(0, 0); | |||
| DECLARE_RESULT_VECTOR(0, 1); | |||
| for (; k < K; k++) { | |||
| QUADWORD_LOAD_B(0, 0); | |||
| GATHER_LOAD_A(pg_true, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); | |||
| } | |||
| VECTOR_STORE(pg_true, 0, 0); | |||
| VECTOR_STORE(pg_true, 0, 1); | |||
| INCR_C_POINTER(0, 2); | |||
| } | |||
| for (; j < N; j++) { | |||
| CREATE_B_POINTER(0, 0); | |||
| UPDATE_B_POINTER(1); | |||
| BLASLONG k = 0; | |||
| DECLARE_RESULT_VECTOR(0, 0); | |||
| for (; k < K; k++) { | |||
| BROADCAST_LOAD_B(0, 0); | |||
| GATHER_LOAD_A(pg_true, 0, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); | |||
| } | |||
| VECTOR_STORE(pg_true, 0, 0); | |||
| INCR_C_POINTER(0, 1); | |||
| } | |||
| UPDATE_A_POINTER(v_size); | |||
| RESET_B_POINTER(); | |||
| UPDATE_C_POINTER(v_size); | |||
| } | |||
| for (; i < M; i += v_size) { | |||
| const svbool_t pg_tail = svwhilelt_b64((uint64_t)i, (uint64_t)(M)); | |||
| CREATE_C_POINTER(0, 0); | |||
| CREATE_A_POINTER(0, 0); | |||
| BLASLONG j = 0; | |||
| for (; j < n4; j += 4) { | |||
| CREATE_B_POINTER(0, 0); | |||
| CREATE_B_POINTER(1, 1); | |||
| CREATE_B_POINTER(2, 2); | |||
| CREATE_B_POINTER(3, 3); | |||
| UPDATE_B_POINTER(4); | |||
| BLASLONG k = 0; | |||
| DECLARE_RESULT_VECTOR(0, 0); | |||
| DECLARE_RESULT_VECTOR(0, 1); | |||
| DECLARE_RESULT_VECTOR(0, 2); | |||
| DECLARE_RESULT_VECTOR(0, 3); | |||
| for (; k < K; k++) { | |||
| QUADWORD_LOAD_B(0, 0); | |||
| GATHER_LOAD_A(pg_tail, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); | |||
| QUADWORD_LOAD_B(2, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 2, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 2, 1, 0); | |||
| } | |||
| VECTOR_STORE(pg_tail, 0, 0); | |||
| VECTOR_STORE(pg_tail, 0, 1); | |||
| VECTOR_STORE(pg_tail, 0, 2); | |||
| VECTOR_STORE(pg_tail, 0, 3); | |||
| INCR_C_POINTER(0, 4); | |||
| } | |||
| for (; j < n2; j += 2) { | |||
| CREATE_B_POINTER(0, 0); | |||
| CREATE_B_POINTER(1, 1); | |||
| UPDATE_B_POINTER(2); | |||
| BLASLONG k = 0; | |||
| DECLARE_RESULT_VECTOR(0, 0); | |||
| DECLARE_RESULT_VECTOR(0, 1); | |||
| for (; k < K; k++) { | |||
| QUADWORD_LOAD_B(0, 0); | |||
| GATHER_LOAD_A(pg_tail, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); | |||
| } | |||
| VECTOR_STORE(pg_tail, 0, 0); | |||
| VECTOR_STORE(pg_tail, 0, 1); | |||
| INCR_C_POINTER(0, 2); | |||
| } | |||
| for (; j < N; j++) { | |||
| CREATE_B_POINTER(0, 0); | |||
| UPDATE_B_POINTER(1); | |||
| BLASLONG k = 0; | |||
| DECLARE_RESULT_VECTOR(0, 0); | |||
| for (; k < K; k++) { | |||
| BROADCAST_LOAD_B(0, 0); | |||
| GATHER_LOAD_A(pg_tail, 0, 0); | |||
| UPDATE_RESULT_VECTOR(pg_tail, 0, 0, 0); | |||
| } | |||
| VECTOR_STORE(pg_tail, 0, 0); | |||
| INCR_C_POINTER(0, 1); | |||
| } | |||
| UPDATE_A_POINTER(0); | |||
| RESET_B_POINTER(); | |||
| UPDATE_C_POINTER(0); | |||
| } | |||
| if (pack_a) | |||
| free(packed_a); | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,43 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2024, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| int CNAME(int transa, int transb, BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT beta) | |||
| { | |||
| BLASLONG MNK = M * N * K; | |||
| #if defined(DOUBLE) // dgemm | |||
| if (MNK <= 64*64*64) | |||
| return 1; | |||
| #else // sgemm | |||
| if (MNK <= 64*64*64) | |||
| return 1; | |||
| #endif | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,92 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2024, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written | |||
| permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include <arm_sve.h> | |||
| #include "common.h" | |||
| #ifdef DOUBLE | |||
| #define SV_COUNT svcntd | |||
| #define SV_TYPE svfloat64_t | |||
| #define SV_TRUE svptrue_b64 | |||
| #define SV_WHILE svwhilelt_b64_s64 | |||
| #define SV_DUP svdup_f64 | |||
| #else | |||
| #define SV_COUNT svcntw | |||
| #define SV_TYPE svfloat32_t | |||
| #define SV_TRUE svptrue_b32 | |||
| #define SV_WHILE svwhilelt_b32_s64 | |||
| #define SV_DUP svdup_f32 | |||
| #endif | |||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | |||
| { | |||
| BLASLONG i; | |||
| BLASLONG ix,iy; | |||
| BLASLONG j; | |||
| FLOAT *a_ptr; | |||
| FLOAT temp; | |||
| ix = 0; | |||
| a_ptr = a; | |||
| if (inc_y == 1) { | |||
| uint64_t sve_size = SV_COUNT(); | |||
| for (j = 0; j < n; j++) { | |||
| SV_TYPE temp_vec = SV_DUP(alpha * x[ix]); | |||
| i = 0; | |||
| svbool_t pg = SV_WHILE(i, m); | |||
| while (svptest_any(SV_TRUE(), pg)) { | |||
| SV_TYPE a_vec = svld1(pg, a_ptr + i); | |||
| SV_TYPE y_vec = svld1(pg, y + i); | |||
| y_vec = svmla_x(pg, y_vec, temp_vec, a_vec); | |||
| svst1(pg, y + i, y_vec); | |||
| i += sve_size; | |||
| pg = SV_WHILE(i, m); | |||
| } | |||
| a_ptr += lda; | |||
| ix += inc_x; | |||
| } | |||
| return(0); | |||
| } | |||
| for (j = 0; j < n; j++) { | |||
| temp = alpha * x[ix]; | |||
| iy = 0; | |||
| for (i = 0; i < m; i++) { | |||
| y[iy] += temp * a_ptr[i]; | |||
| iy += inc_y; | |||
| } | |||
| a_ptr += lda; | |||
| ix += inc_x; | |||
| } | |||
| return (0); | |||
| } | |||
| @@ -1,5 +1,5 @@ | |||
| /******************************************************************************* | |||
| Copyright (c) 2015, The OpenBLAS Project | |||
| Copyright (c) 2015, 2024 The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| @@ -170,39 +170,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .macro KERNEL_F32_FINALIZE | |||
| #if !defined(DOUBLE) | |||
| fadd v1.4s, v1.4s, v2.4s | |||
| // F8 only has 2 accumulators | |||
| // so add into those pairs | |||
| fadd v1.4s, v1.4s, v3.4s | |||
| fadd v1.4s, v1.4s, v4.4s | |||
| #else | |||
| fadd v1.2d, v1.2d, v2.2d | |||
| fadd v1.2d, v1.2d, v3.2d | |||
| fadd v1.2d, v1.2d, v4.2d | |||
| fadd v2.4s, v2.4s, v4.4s | |||
| #endif | |||
| .endm | |||
| .macro KERNEL_F4 | |||
| .macro KERNEL_F8 | |||
| #if !defined(DOUBLE) | |||
| ld1 {v2.4s}, [A_PTR], #16 | |||
| ld1 {v3.4s}, [X_PTR], #16 | |||
| fmla v1.4s, v2.4s, v3.4s | |||
| #else | |||
| ld1 {v2.2d}, [A_PTR], #16 | |||
| ld1 {v3.2d}, [X_PTR], #16 | |||
| fmla v1.2d, v2.2d, v3.2d | |||
| ld1 {v4.2d}, [A_PTR], #16 | |||
| ld1 {v5.2d}, [X_PTR], #16 | |||
| fmla v1.2d, v4.2d, v5.2d | |||
| ld1 {v13.4s, v14.4s}, [A_PTR], #32 | |||
| ld1 {v17.4s, v18.4s}, [X_PTR], #32 | |||
| fmla v1.4s, v13.4s, v17.4s | |||
| fmla v2.4s, v14.4s, v18.4s | |||
| #else | |||
| ld1 {v13.2d, v14.2d, v15.2d, v16.2d}, [A_PTR], #64 | |||
| ld1 {v17.2d, v18.2d, v19.2d, v20.2d}, [X_PTR], #64 | |||
| fmla v1.2d, v13.2d, v17.2d | |||
| fmla v2.2d, v14.2d, v18.2d | |||
| fmla v3.2d, v15.2d, v19.2d | |||
| fmla v4.2d, v16.2d, v20.2d | |||
| #endif | |||
| .endm | |||
| .macro KERNEL_F4_FINALIZE | |||
| .macro KERNEL_F8_FINALIZE | |||
| #if !defined(DOUBLE) | |||
| ext v2.16b, v1.16b, v1.16b, #8 | |||
| // Take the top two elements of v1 and | |||
| // put them into the first two lanes of v3 | |||
| ext v3.16b, v1.16b, v1.16b, #8 | |||
| fadd v1.2s, v1.2s, v3.2s | |||
| ext v4.16b, v2.16b, v2.16b, #8 | |||
| fadd v2.2s, v2.2s, v4.2s | |||
| // Final pair | |||
| fadd v1.2s, v1.2s, v2.2s | |||
| faddp TEMP, v1.2s | |||
| #else | |||
| faddp TEMP, v1.2d | |||
| faddp TEMP1, v2.2d | |||
| faddp TEMP2, v3.2d | |||
| faddp TEMP3, v4.2d | |||
| fadd TEMP, TEMP, TEMP1 | |||
| fadd TEMP2, TEMP2, TEMP3 | |||
| fadd TEMP, TEMP, TEMP2 | |||
| #endif | |||
| .endm | |||
| @@ -258,7 +267,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| asr I, M, #5 | |||
| cmp I, xzr | |||
| beq .Lgemv_t_kernel_F4 | |||
| beq .Lgemv_t_kernel_F8 | |||
| .Lgemv_t_kernel_F320: | |||
| @@ -269,24 +278,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| KERNEL_F32_FINALIZE | |||
| .Lgemv_t_kernel_F4: | |||
| .Lgemv_t_kernel_F8: | |||
| ands I, M, #31 | |||
| asr I, I, #2 | |||
| asr I, I, #3 | |||
| cmp I, xzr | |||
| beq .Lgemv_t_kernel_F1 | |||
| .Lgemv_t_kernel_F40: | |||
| .Lgemv_t_kernel_F80: | |||
| KERNEL_F4 | |||
| KERNEL_F8 | |||
| subs I, I, #1 | |||
| bne .Lgemv_t_kernel_F40 | |||
| bne .Lgemv_t_kernel_F80 | |||
| .Lgemv_t_kernel_F1: | |||
| KERNEL_F4_FINALIZE | |||
| KERNEL_F8_FINALIZE | |||
| ands I, M, #3 | |||
| ands I, M, #7 | |||
| ble .Lgemv_t_kernel_F_END | |||
| .Lgemv_t_kernel_F10: | |||
| @@ -0,0 +1,120 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2024, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written | |||
| permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include <arm_sve.h> | |||
| #include "common.h" | |||
| #ifdef DOUBLE | |||
| #define SV_COUNT svcntd | |||
| #define SV_TYPE svfloat64_t | |||
| #define SV_TRUE svptrue_b64 | |||
| #define SV_WHILE svwhilelt_b64_s64 | |||
| #define SV_DUP svdup_f64 | |||
| #else | |||
| #define SV_COUNT svcntw | |||
| #define SV_TYPE svfloat32_t | |||
| #define SV_TRUE svptrue_b32 | |||
| #define SV_WHILE svwhilelt_b32_s64 | |||
| #define SV_DUP svdup_f32 | |||
| #endif | |||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | |||
| { | |||
| BLASLONG i; | |||
| BLASLONG ix,iy; | |||
| BLASLONG j; | |||
| FLOAT *a_ptr; | |||
| FLOAT temp; | |||
| iy = 0; | |||
| a_ptr = a; | |||
| if (inc_x == 1) { | |||
| svbool_t pg_true = SV_TRUE(); | |||
| uint64_t sve_size = SV_COUNT(); | |||
| uint64_t sve_size2 = sve_size * 2; | |||
| BLASLONG m1 = m & -sve_size; | |||
| BLASLONG m2 = m & -sve_size2; | |||
| for (j = 0; j < n; j++) { | |||
| BLASLONG i = 0; | |||
| SV_TYPE temp_vec_v2_0 = SV_DUP(0.0); | |||
| SV_TYPE temp_vec_v2_1 = SV_DUP(0.0); | |||
| for (; i < m2; i += sve_size2) { | |||
| SV_TYPE a_vec0 = svld1(pg_true, a_ptr + i); | |||
| SV_TYPE x_vec0 = svld1(pg_true, x + i); | |||
| SV_TYPE a_vec1 = svld1(pg_true, a_ptr + i + sve_size); | |||
| SV_TYPE x_vec1 = svld1(pg_true, x + i + sve_size); | |||
| temp_vec_v2_0 = svmla_m(pg_true, temp_vec_v2_0, a_vec0, x_vec0); | |||
| temp_vec_v2_1 = svmla_m(pg_true, temp_vec_v2_1, a_vec1, x_vec1); | |||
| } | |||
| SV_TYPE temp_vec_v1 = SV_DUP(0.0); | |||
| for (; i < m1; i += sve_size) { | |||
| SV_TYPE a_vec0 = svld1(pg_true, a_ptr + i); | |||
| SV_TYPE x_vec0 = svld1(pg_true, x + i); | |||
| temp_vec_v1 = svmla_m(pg_true, temp_vec_v1, a_vec0, x_vec0); | |||
| } | |||
| SV_TYPE temp_vec = SV_DUP(0.0); | |||
| for (; i < m; i += sve_size) { | |||
| svbool_t pg = SV_WHILE(i, m); | |||
| SV_TYPE a_vec = svld1(pg, a_ptr + i); | |||
| SV_TYPE x_vec = svld1(pg, x + i); | |||
| temp_vec = svmla_m(pg, temp_vec, a_vec, x_vec); | |||
| } | |||
| y[iy] += alpha * ( | |||
| (svaddv(SV_TRUE(), temp_vec_v2_0) + svaddv(SV_TRUE(), temp_vec)) + | |||
| (svaddv(SV_TRUE(), temp_vec_v2_1) + svaddv(SV_TRUE(), temp_vec_v1)) | |||
| ); | |||
| iy += inc_y; | |||
| a_ptr += lda; | |||
| } | |||
| return(0); | |||
| } | |||
| for (j = 0; j < n; j++) { | |||
| temp = 0.0; | |||
| ix = 0; | |||
| for (i = 0; i < m; i++) { | |||
| temp += a_ptr[i] * x[ix]; | |||
| ix += inc_x; | |||
| } | |||
| y[iy] += alpha * temp; | |||
| iy += inc_y; | |||
| a_ptr += lda; | |||
| } | |||
| return (0); | |||
| } | |||
| @@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define X_COPY x5 /* X vector address */ | |||
| #define INC_X x4 /* X stride */ | |||
| #define I x1 /* loop variable */ | |||
| #define FLAG x9 | |||
| /******************************************************************************* | |||
| * Macro definitions | |||
| *******************************************************************************/ | |||
| @@ -168,9 +168,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| cmp N, xzr | |||
| ble .Lscal_kernel_L999 | |||
| //fcmp DA, #0.0 | |||
| //beq .Lscal_kernel_zero | |||
| ldr FLAG, [sp] | |||
| cmp FLAG, #1 | |||
| beq .Lscal_kernel_nansafe | |||
| fcmp DA, #0.0 | |||
| beq .Lscal_kernel_zero | |||
| .Lscal_kernel_nansafe: | |||
| cmp INC_X, #1 | |||
| bne .Lscal_kernel_S_BEGIN | |||
| @@ -0,0 +1,687 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2024, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |||
| CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE | |||
| GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | |||
| HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | |||
| LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF | |||
| THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #include <arm_neon.h> | |||
| #include <arm_sve.h> | |||
| #if defined(__ARM_NEON_SVE_BRIDGE) && defined(__has_include) && \ | |||
| __has_include(<arm_neon_sve_bridge.h>) | |||
| #include <arm_neon_sve_bridge.h> | |||
| #else | |||
| #define svdup_neonq_f32(fixed_reg) \ | |||
| ({ \ | |||
| svfloat32_t scalable_reg; \ | |||
| asm("mov %0.q, %q1" : "=w"(scalable_reg) : "w"(fixed_reg) :); \ | |||
| scalable_reg; \ | |||
| }) | |||
| #define svdup_neonq_f64(fixed_reg) \ | |||
| ({ \ | |||
| svfloat64_t scalable_reg; \ | |||
| asm("mov %0.q, %q1" : "=w"(scalable_reg) : "w"(fixed_reg) :); \ | |||
| scalable_reg; \ | |||
| }) | |||
| #endif | |||
| #define RESET_A_POINTER() a_offset = A; | |||
| #define CREATE_A_POINTER(m, scale) FLOAT* a_offset##m = a_offset + scale; | |||
| #define UPDATE_A_POINTER(scale) a_offset = a_offset + scale; | |||
| #define A_ELEMENT_K(m, offset_k) *(a_offset##m + (k + offset_k) * lda) | |||
| #define A_ELEMENT(m) A_ELEMENT_K(m, 0) | |||
| #define RESET_B_POINTER() b_offset = B; | |||
| #define CREATE_B_POINTER(n, scale) FLOAT* b_offset##n = b_offset + scale * ldb; | |||
| #define UPDATE_B_POINTER(scale) b_offset = b_offset + scale * ldb; | |||
| #define B_ELEMENT_K(n, offset_k) *(b_offset##n + (k + offset_k)) | |||
| #define B_ELEMENT(n) B_ELEMENT_K(n, 0) | |||
| #define CREATE_C_POINTER(n, scale) FLOAT* c_offset##n = c_offset + scale * ldc; | |||
| #define INCR_C_POINTER(m, incr) // c_offset ## m += incr; | |||
| #define UPDATE_C_POINTER(scale) c_offset = c_offset + scale * ldc; | |||
| #define C_ELEMENT(m, n) *(c_offset##n + ((m * v_size) + i)) | |||
| // #undef C_ELEMENT | |||
| // #define C_ELEMENT(m, n) C[(i+(m))+(j+(n))*ldc] | |||
| #define PACK_ELEMENT_K(n, offset_k) packed_b[(k + offset_k) * 4 + n] | |||
| #define PACK_ELEMENT(n) PACK_ELEMENT_K(n, 0) | |||
| // ASIMD | |||
| #define DECLARE_RESULT_VECTOR4(m, n) \ | |||
| float32x4_t result##m##n = vdupq_n_f32(0.0); | |||
| #define DECLARE_RESULT(m, n) float32_t result##m##n = 0.0; | |||
| #define BROADCAST_LOAD_A4(m, offset_k) \ | |||
| float32x4_t a##m##_k##offset_k = vld1q_dup_f32(&A_ELEMENT_K(m, offset_k)); | |||
| #define LOAD_A1(m, offset_k) \ | |||
| float32_t a##m##_k##offset_k = A_ELEMENT_K(m, offset_k); | |||
| #define VECTOR_LOAD_B_K4(n, offset_k) \ | |||
| float32x4_t b##k##n##_k##offset_k = vld1q_f32(&B_ELEMENT_K(n, offset_k)); | |||
| #define TRANSPOSE_B4_K4( \ | |||
| n0, n1, n2, n3, offset_k0, offset_k1, offset_k2, offset_k3) \ | |||
| float32x4_t b##t##n0##_k##offset_k0 = \ | |||
| vzip1q_f32(b##k##n0##_k##offset_k0, b##k##n1##_k##offset_k0); \ | |||
| float32x4_t b##t##n0##_k##offset_k1 = \ | |||
| vzip2q_f32(b##k##n0##_k##offset_k0, b##k##n1##_k##offset_k0); \ | |||
| float32x4_t b##t##n0##_k##offset_k2 = \ | |||
| vzip1q_f32(b##k##n2##_k##offset_k0, b##k##n3##_k##offset_k0); \ | |||
| float32x4_t b##t##n0##_k##offset_k3 = \ | |||
| vzip2q_f32(b##k##n2##_k##offset_k0, b##k##n3##_k##offset_k0); \ | |||
| float32x4_t b##n0##_k##offset_k0 = vreinterpretq_f32_f64( \ | |||
| vzip1q_f64(vreinterpretq_f64_f32(b##t##n0##_k##offset_k0), \ | |||
| vreinterpretq_f64_f32(b##t##n0##_k##offset_k2))); \ | |||
| float32x4_t b##n0##_k##offset_k1 = vreinterpretq_f32_f64( \ | |||
| vzip2q_f64(vreinterpretq_f64_f32(b##t##n0##_k##offset_k0), \ | |||
| vreinterpretq_f64_f32(b##t##n0##_k##offset_k2))); \ | |||
| float32x4_t b##n0##_k##offset_k2 = vreinterpretq_f32_f64( \ | |||
| vzip1q_f64(vreinterpretq_f64_f32(b##t##n0##_k##offset_k1), \ | |||
| vreinterpretq_f64_f32(b##t##n0##_k##offset_k3))); \ | |||
| float32x4_t b##n0##_k##offset_k3 = vreinterpretq_f32_f64( \ | |||
| vzip2q_f64(vreinterpretq_f64_f32(b##t##n0##_k##offset_k1), \ | |||
| vreinterpretq_f64_f32(b##t##n0##_k##offset_k3))); | |||
| #define SCALE_B4_K4(n0, offset_k0, offset_k1, offset_k2, offset_k3) \ | |||
| svfloat32_t b##s##n0##_k##offset_k0 = svdup_neonq_f32(b##n0##_k##offset_k0); \ | |||
| svfloat32_t b##s##n0##_k##offset_k1 = svdup_neonq_f32(b##n0##_k##offset_k1); \ | |||
| svfloat32_t b##s##n0##_k##offset_k2 = svdup_neonq_f32(b##n0##_k##offset_k2); \ | |||
| svfloat32_t b##s##n0##_k##offset_k3 = svdup_neonq_f32(b##n0##_k##offset_k3); | |||
| #define GATHER_LOAD_B4(n, offset_k) \ | |||
| float32x4_t b##n##_k##offset_k = vdupq_n_f32(B_ELEMENT_K(n, offset_k)); \ | |||
| b##n##_k##offset_k = \ | |||
| vsetq_lane_f32(B_ELEMENT_K(n + 1, offset_k), b##n##_k##offset_k, 1); \ | |||
| b##n##_k##offset_k = \ | |||
| vsetq_lane_f32(B_ELEMENT_K(n + 2, offset_k), b##n##_k##offset_k, 2); \ | |||
| b##n##_k##offset_k = \ | |||
| vsetq_lane_f32(B_ELEMENT_K(n + 3, offset_k), b##n##_k##offset_k, 3); | |||
| #define VECTOR_UNPACK_B4(n, offset_k) \ | |||
| float32x4_t b##n##_k##offset_k = vld1q_f32(&PACK_ELEMENT_K(n, offset_k)); | |||
| #define VECTOR_PACK_B4(n, offset_k) \ | |||
| vst1q_f32(&PACK_ELEMENT_K(n, offset_k), b##n##_k##offset_k); | |||
| #define PACK_B0(n, offset_k) \ | |||
| PACK_ELEMENT_K(n, offset_k) = vget_lane_f32(b##n##_k##offset_k, 0); | |||
| #define UPDATE_RESULT_VECTOR4(m, n, offset_k) \ | |||
| result##m##n = \ | |||
| vfmaq_f32(result##m##n, a##m##_k##offset_k, b##n##_k##offset_k); | |||
| #define UPDATE_RESULT(m, n, offset_k) \ | |||
| result##m##n = result##m##n + a##m##_k##offset_k * b##n##_k##offset_k; | |||
| #ifdef B0 | |||
| #define SCATTER_STORE4(m, n) \ | |||
| result##m##n = vmulq_f32(result##m##n, vdupq_n_f32(alpha)); \ | |||
| C_ELEMENT(m, n + 0) = vgetq_lane_f32(result##m##n, 0); \ | |||
| C_ELEMENT(m, n + 1) = vgetq_lane_f32(result##m##n, 1); \ | |||
| C_ELEMENT(m, n + 2) = vgetq_lane_f32(result##m##n, 2); \ | |||
| C_ELEMENT(m, n + 3) = vgetq_lane_f32(result##m##n, 3); | |||
| #else | |||
| #define SCATTER_STORE4(m, n) \ | |||
| result##m##n = vmulq_f32(result##m##n, vdupq_n_f32(alpha)); \ | |||
| C_ELEMENT(m, n + 0) = \ | |||
| C_ELEMENT(m, n + 0) * beta + vgetq_lane_f32(result##m##n, 0); \ | |||
| C_ELEMENT(m, n + 1) = \ | |||
| C_ELEMENT(m, n + 1) * beta + vgetq_lane_f32(result##m##n, 1); \ | |||
| C_ELEMENT(m, n + 2) = \ | |||
| C_ELEMENT(m, n + 2) * beta + vgetq_lane_f32(result##m##n, 2); \ | |||
| C_ELEMENT(m, n + 3) = \ | |||
| C_ELEMENT(m, n + 3) * beta + vgetq_lane_f32(result##m##n, 3); | |||
| #endif | |||
| // SVE | |||
| #define DECLARE_RESULT_VECTOR(m, n) svfloat32_t result##m##n = svdup_f32(0.0); | |||
| #define BROADCAST_LOAD_A(m, offset_k) \ | |||
| svfloat32_t a##s##m##_k##offset_k = svdup_f32(A_ELEMENT_K(m, offset_k)); | |||
| #define BROADCAST_LOAD_B(n, offset_k) \ | |||
| svfloat32_t b##s##n##_k##offset_k = svdup_f32(B_ELEMENT_K(n, offset_k)); | |||
| #define VECTOR_LOAD_A(pg, m, offset_k) \ | |||
| svfloat32_t a##s##m##_k##offset_k = svld1(pg, &A_ELEMENT_K(m, offset_k)); | |||
| #define QUADWORD_LOAD_B(n, offset_k) \ | |||
| svfloat32_t b##s##n##_k##offset_k = \ | |||
| svld1rq(pg_true, &B_ELEMENT_K(n, offset_k)); | |||
| #define PACK_B(n, offset_k) \ | |||
| svst1(pg_first, &PACK_ELEMENT_K(n, offset_k), b##s##n##_k##offset_k); | |||
| #define VECTOR_PACK_B(n, offset_k) \ | |||
| svst1(pg_true, &PACK_ELEMENT_K(n* v_size, offset_k), b##s##n##_k##offset_k); | |||
| #define QUADWORD_PACK_B(n, offset_k) \ | |||
| svst1(pg_quad, &PACK_ELEMENT_K(n, offset_k), b##s##n##_k##offset_k); | |||
| #define UNPACK_VECTOR_B(n, offset_k) \ | |||
| svfloat32_t b##s##n##_k##offset_k = \ | |||
| svld1(pg_true, &PACK_ELEMENT_K(n * v_size, offset_k)); | |||
| #define UNPACK_BROADCAST_B(n, offset_k) \ | |||
| svfloat32_t b##s##n##_k##offset_k = svdup_f32(PACK_ELEMENT_K(n, offset_k)); | |||
| #define UNPACK_QUADWORD_B(n, offset_k) \ | |||
| svfloat32_t b##s##n##_k##offset_k = \ | |||
| svld1rq(pg_true, &PACK_ELEMENT_K(n, offset_k)); | |||
| #define UPDATE_RESULT_VECTOR(pg, m, n, offset_k) \ | |||
| result##m##n = \ | |||
| svmla_m(pg, result##m##n, a##s##m##_k##offset_k, b##s##n##_k##offset_k); | |||
| #define UPDATE_RESULT_VECTOR_QUADWORD(m, n, outer, lane, offset_k) \ | |||
| result##m##n = svmla_lane( \ | |||
| result##m##n, a##s##m##_k##offset_k, b##s##outer##_k##offset_k, lane); | |||
| #ifdef B0 | |||
| #define VECTOR_STORE(pg, m, n) \ | |||
| result##m##n = svmul_m(pg, result##m##n, alpha_vec); \ | |||
| svst1(pg, &C_ELEMENT(m, n), result##m##n); | |||
| #define SCATTER_STORE(pg, m, n) \ | |||
| result##m##n = svmul_m(pg, result##m##n, alpha_vec); \ | |||
| svst1_scatter_index(pg, &C_ELEMENT(m, n), ldc_vec, result##m##n); | |||
| #else | |||
| #define VECTOR_STORE(pg, m, n) \ | |||
| result##m##n = svmul_m(pg, result##m##n, alpha_vec); \ | |||
| result##m##n = \ | |||
| svmla_m(pg, result##m##n, svld1(pg, &C_ELEMENT(m, n)), beta_vec); \ | |||
| svst1(pg, &C_ELEMENT(m, n), result##m##n); | |||
| #define SCATTER_STORE(pg, m, n) \ | |||
| result##m##n = svmul_m(pg, result##m##n, alpha_vec); \ | |||
| result##m##n = svmla_m(pg, \ | |||
| result##m##n, \ | |||
| svld1_gather_index(pg, &C_ELEMENT(m, n), ldc_vec), \ | |||
| beta_vec); \ | |||
| svst1_scatter_index(pg, &C_ELEMENT(m, n), ldc_vec, result##m##n); | |||
| #endif | |||
| #ifndef LIKELY | |||
| #ifdef __GNUC__ | |||
| #define LIKELY(x) __builtin_expect(!!(x), 1) | |||
| #else | |||
| #define LIKELY(x) (x) | |||
| #endif | |||
| #endif | |||
| #ifdef B0 | |||
| int | |||
| CNAME(BLASLONG M, | |||
| BLASLONG N, | |||
| BLASLONG K, | |||
| IFLOAT* A, | |||
| BLASLONG lda, | |||
| FLOAT alpha, | |||
| IFLOAT* B, | |||
| BLASLONG ldb, | |||
| FLOAT* C, | |||
| BLASLONG ldc) | |||
| #else | |||
| int | |||
| CNAME(BLASLONG M, | |||
| BLASLONG N, | |||
| BLASLONG K, | |||
| IFLOAT* A, | |||
| BLASLONG lda, | |||
| FLOAT alpha, | |||
| IFLOAT* B, | |||
| BLASLONG ldb, | |||
| FLOAT beta, | |||
| FLOAT* C, | |||
| BLASLONG ldc) | |||
| #endif | |||
| { | |||
| const uint64_t v_size = svcntw(); | |||
| const uint64_t v_size2 = v_size * 2; | |||
| const svbool_t pg_true = svptrue_b32(); | |||
| const svbool_t pg_quad = svwhilelt_b32(0, 4); | |||
| const svbool_t pg_first = svwhilelt_b32(0, 1); | |||
| const svfloat32_t alpha_vec = svdup_f32(alpha); | |||
| #ifndef B0 | |||
| const svfloat32_t beta_vec = svdup_f32(beta); | |||
| #endif | |||
| const BLASLONG n4 = N & -4; | |||
| const BLASLONG v_m2 = M & -v_size2; | |||
| const BLASLONG v_m1 = M & -v_size; | |||
| const BLASLONG k4 = K & -4; | |||
| const int pack_b = M >= v_size2 && N >= 8 && K >= 8 ? 1 : 0; | |||
| FLOAT* packed_b = | |||
| (pack_b) ? packed_b = (FLOAT*)malloc(K * 4 * sizeof(FLOAT)) : NULL; | |||
| FLOAT* b_offset = B; | |||
| FLOAT* a_offset = A; | |||
| FLOAT* c_offset = C; | |||
| BLASLONG j = 0; | |||
| for (; j < n4; j += 4) { | |||
| CREATE_C_POINTER(0, 0); | |||
| CREATE_C_POINTER(1, 1); | |||
| CREATE_C_POINTER(2, 2); | |||
| CREATE_C_POINTER(3, 3); | |||
| CREATE_B_POINTER(0, 0); | |||
| CREATE_B_POINTER(1, 1); | |||
| CREATE_B_POINTER(2, 2); | |||
| CREATE_B_POINTER(3, 3); | |||
| BLASLONG i = 0; | |||
| for (; i < v_m2; i += v_size2) { | |||
| CREATE_A_POINTER(0, 0); | |||
| CREATE_A_POINTER(1, v_size); | |||
| UPDATE_A_POINTER(v_size2); | |||
| BLASLONG k = 0; | |||
| DECLARE_RESULT_VECTOR(0, 0); | |||
| DECLARE_RESULT_VECTOR(0, 1); | |||
| DECLARE_RESULT_VECTOR(0, 2); | |||
| DECLARE_RESULT_VECTOR(0, 3); | |||
| DECLARE_RESULT_VECTOR(1, 0); | |||
| DECLARE_RESULT_VECTOR(1, 1); | |||
| DECLARE_RESULT_VECTOR(1, 2); | |||
| DECLARE_RESULT_VECTOR(1, 3); | |||
| if (LIKELY(packed_b != NULL)) { | |||
| if (i == 0) { | |||
| for (; k < k4; k += 4) { | |||
| VECTOR_LOAD_B_K4(0, 0); | |||
| VECTOR_LOAD_B_K4(1, 0); | |||
| VECTOR_LOAD_B_K4(2, 0); | |||
| VECTOR_LOAD_B_K4(3, 0); | |||
| TRANSPOSE_B4_K4(0, 1, 2, 3, 0, 1, 2, 3); | |||
| SCALE_B4_K4(0, 0, 1, 2, 3); | |||
| VECTOR_PACK_B4(0, 0); | |||
| VECTOR_PACK_B4(0, 1); | |||
| VECTOR_PACK_B4(0, 2); | |||
| VECTOR_PACK_B4(0, 3); | |||
| VECTOR_LOAD_A(pg_true, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 0); | |||
| VECTOR_LOAD_A(pg_true, 0, 1); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 1); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 1); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 1); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 1); | |||
| VECTOR_LOAD_A(pg_true, 0, 2); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 2); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 2); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 2); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 2); | |||
| VECTOR_LOAD_A(pg_true, 0, 3); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 3); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 3); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 3); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 3); | |||
| VECTOR_LOAD_A(pg_true, 1, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 0, 2, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 0, 3, 0); | |||
| VECTOR_LOAD_A(pg_true, 1, 1); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 1); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 1); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 0, 2, 1); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 0, 3, 1); | |||
| VECTOR_LOAD_A(pg_true, 1, 2); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 2); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 2); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 0, 2, 2); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 0, 3, 2); | |||
| VECTOR_LOAD_A(pg_true, 1, 3); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 3); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 3); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 0, 2, 3); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 0, 3, 3); | |||
| } | |||
| for (; k < K; k++) { | |||
| BROADCAST_LOAD_B(0, 0); | |||
| PACK_B(0, 0); | |||
| VECTOR_LOAD_A(pg_true, 0, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); | |||
| BROADCAST_LOAD_B(1, 0); | |||
| PACK_B(1, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 1, 0); | |||
| VECTOR_LOAD_A(pg_true, 1, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 0, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 1, 0); | |||
| BROADCAST_LOAD_B(2, 0); | |||
| PACK_B(2, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 2, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 2, 0); | |||
| BROADCAST_LOAD_B(3, 0); | |||
| PACK_B(3, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 3, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 3, 0); | |||
| } | |||
| } else { | |||
| for (; k < K; k++) { | |||
| UNPACK_QUADWORD_B(0, 0); | |||
| VECTOR_LOAD_A(pg_true, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 0); | |||
| VECTOR_LOAD_A(pg_true, 1, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 0, 2, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 0, 3, 0); | |||
| } | |||
| } | |||
| } else { | |||
| for (; k < k4; k += 4) { | |||
| VECTOR_LOAD_B_K4(0, 0); | |||
| VECTOR_LOAD_B_K4(1, 0); | |||
| VECTOR_LOAD_B_K4(2, 0); | |||
| VECTOR_LOAD_B_K4(3, 0); | |||
| TRANSPOSE_B4_K4(0, 1, 2, 3, 0, 1, 2, 3); | |||
| SCALE_B4_K4(0, 0, 1, 2, 3); | |||
| VECTOR_LOAD_A(pg_true, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 0); | |||
| VECTOR_LOAD_A(pg_true, 0, 1); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 1); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 1); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 1); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 1); | |||
| VECTOR_LOAD_A(pg_true, 0, 2); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 2); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 2); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 2); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 2); | |||
| VECTOR_LOAD_A(pg_true, 0, 3); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 3); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 3); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 3); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 3); | |||
| VECTOR_LOAD_A(pg_true, 1, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 0, 2, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 0, 3, 0); | |||
| VECTOR_LOAD_A(pg_true, 1, 1); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 1); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 1); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 0, 2, 1); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 0, 3, 1); | |||
| VECTOR_LOAD_A(pg_true, 1, 2); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 2); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 2); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 0, 2, 2); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 0, 3, 2); | |||
| VECTOR_LOAD_A(pg_true, 1, 3); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 3); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 3); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 0, 2, 3); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 0, 3, 3); | |||
| } | |||
| for (; k < K; k++) { | |||
| BROADCAST_LOAD_B(0, 0); | |||
| VECTOR_LOAD_A(pg_true, 0, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); | |||
| BROADCAST_LOAD_B(1, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 1, 0); | |||
| VECTOR_LOAD_A(pg_true, 1, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 0, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 1, 0); | |||
| BROADCAST_LOAD_B(2, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 2, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 2, 0); | |||
| BROADCAST_LOAD_B(3, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 3, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 3, 0); | |||
| } | |||
| } | |||
| VECTOR_STORE(pg_true, 0, 0); | |||
| VECTOR_STORE(pg_true, 0, 1); | |||
| VECTOR_STORE(pg_true, 0, 2); | |||
| VECTOR_STORE(pg_true, 0, 3); | |||
| VECTOR_STORE(pg_true, 1, 0); | |||
| VECTOR_STORE(pg_true, 1, 1); | |||
| VECTOR_STORE(pg_true, 1, 2); | |||
| VECTOR_STORE(pg_true, 1, 3); | |||
| INCR_C_POINTER(0, v_size2); | |||
| INCR_C_POINTER(1, v_size2); | |||
| INCR_C_POINTER(2, v_size2); | |||
| INCR_C_POINTER(3, v_size2); | |||
| } | |||
| for (; i < v_m1; i += v_size) { | |||
| CREATE_A_POINTER(0, 0); | |||
| UPDATE_A_POINTER(v_size); | |||
| BLASLONG k = 0; | |||
| DECLARE_RESULT_VECTOR(0, 0); | |||
| DECLARE_RESULT_VECTOR(0, 1); | |||
| DECLARE_RESULT_VECTOR(0, 2); | |||
| DECLARE_RESULT_VECTOR(0, 3); | |||
| if (LIKELY(packed_b != NULL)) { | |||
| for (; k < K; k++) { | |||
| UNPACK_QUADWORD_B(0, 0); | |||
| VECTOR_LOAD_A(pg_true, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 0); | |||
| } | |||
| } else { | |||
| for (; k < k4; k += 4) { | |||
| VECTOR_LOAD_B_K4(0, 0); | |||
| VECTOR_LOAD_B_K4(1, 0); | |||
| VECTOR_LOAD_B_K4(2, 0); | |||
| VECTOR_LOAD_B_K4(3, 0); | |||
| TRANSPOSE_B4_K4(0, 1, 2, 3, 0, 1, 2, 3); | |||
| SCALE_B4_K4(0, 0, 1, 2, 3); | |||
| VECTOR_LOAD_A(pg_true, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 0); | |||
| VECTOR_LOAD_A(pg_true, 0, 1); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 1); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 1); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 1); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 1); | |||
| VECTOR_LOAD_A(pg_true, 0, 2); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 2); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 2); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 2); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 2); | |||
| VECTOR_LOAD_A(pg_true, 0, 3); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 3); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 3); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 3); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 3); | |||
| } | |||
| for (; k < K; k++) { | |||
| BROADCAST_LOAD_B(0, 0); | |||
| VECTOR_LOAD_A(pg_true, 0, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); | |||
| BROADCAST_LOAD_B(1, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 1, 0); | |||
| BROADCAST_LOAD_B(2, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 2, 0); | |||
| BROADCAST_LOAD_B(3, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 3, 0); | |||
| } | |||
| } | |||
| VECTOR_STORE(pg_true, 0, 0); | |||
| VECTOR_STORE(pg_true, 0, 1); | |||
| VECTOR_STORE(pg_true, 0, 2); | |||
| VECTOR_STORE(pg_true, 0, 3); | |||
| INCR_C_POINTER(0, v_size); | |||
| INCR_C_POINTER(1, v_size); | |||
| INCR_C_POINTER(2, v_size); | |||
| INCR_C_POINTER(3, v_size); | |||
| } | |||
| for (; i < M; i += v_size) { | |||
| const svbool_t pg_tail = svwhilelt_b32((uint32_t)i, (uint32_t)(M)); | |||
| CREATE_A_POINTER(0, 0); | |||
| UPDATE_A_POINTER(0); | |||
| BLASLONG k = 0; | |||
| DECLARE_RESULT_VECTOR(0, 0); | |||
| DECLARE_RESULT_VECTOR(0, 1); | |||
| DECLARE_RESULT_VECTOR(0, 2); | |||
| DECLARE_RESULT_VECTOR(0, 3); | |||
| if (LIKELY(packed_b != NULL)) { | |||
| for (; k < K; k++) { | |||
| UNPACK_QUADWORD_B(0, 0); | |||
| VECTOR_LOAD_A(pg_tail, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 0); | |||
| } | |||
| } else { | |||
| for (; k < k4; k += 4) { | |||
| VECTOR_LOAD_B_K4(0, 0); | |||
| VECTOR_LOAD_B_K4(1, 0); | |||
| VECTOR_LOAD_B_K4(2, 0); | |||
| VECTOR_LOAD_B_K4(3, 0); | |||
| TRANSPOSE_B4_K4(0, 1, 2, 3, 0, 1, 2, 3); | |||
| SCALE_B4_K4(0, 0, 1, 2, 3); | |||
| VECTOR_LOAD_A(pg_tail, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 0); | |||
| VECTOR_LOAD_A(pg_tail, 0, 1); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 1); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 1); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 1); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 1); | |||
| VECTOR_LOAD_A(pg_tail, 0, 2); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 2); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 2); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 2); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 2); | |||
| VECTOR_LOAD_A(pg_tail, 0, 3); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 3); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 3); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 3); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 3); | |||
| } | |||
| for (; k < K; k++) { | |||
| BROADCAST_LOAD_B(0, 0); | |||
| VECTOR_LOAD_A(pg_tail, 0, 0); | |||
| UPDATE_RESULT_VECTOR(pg_tail, 0, 0, 0); | |||
| BROADCAST_LOAD_B(1, 0); | |||
| UPDATE_RESULT_VECTOR(pg_tail, 0, 1, 0); | |||
| BROADCAST_LOAD_B(2, 0); | |||
| UPDATE_RESULT_VECTOR(pg_tail, 0, 2, 0); | |||
| BROADCAST_LOAD_B(3, 0); | |||
| UPDATE_RESULT_VECTOR(pg_tail, 0, 3, 0); | |||
| } | |||
| } | |||
| VECTOR_STORE(pg_tail, 0, 0); | |||
| VECTOR_STORE(pg_tail, 0, 1); | |||
| VECTOR_STORE(pg_tail, 0, 2); | |||
| VECTOR_STORE(pg_tail, 0, 3); | |||
| INCR_C_POINTER(0, 0); | |||
| INCR_C_POINTER(1, 0); | |||
| INCR_C_POINTER(2, 0); | |||
| INCR_C_POINTER(3, 0); | |||
| } | |||
| UPDATE_B_POINTER(4); | |||
| RESET_A_POINTER(); | |||
| UPDATE_C_POINTER(4); | |||
| } | |||
| for (; j < N; j++) { | |||
| CREATE_C_POINTER(0, 0); | |||
| CREATE_B_POINTER(0, 0); | |||
| BLASLONG i = 0; | |||
| for (; i < v_m2; i += v_size2) { | |||
| CREATE_A_POINTER(0, 0); | |||
| CREATE_A_POINTER(1, v_size); | |||
| UPDATE_A_POINTER(v_size2); | |||
| BLASLONG k = 0; | |||
| DECLARE_RESULT_VECTOR(0, 0); | |||
| DECLARE_RESULT_VECTOR(1, 0); | |||
| for (; k < K; k++) { | |||
| BROADCAST_LOAD_B(0, 0); | |||
| VECTOR_LOAD_A(pg_true, 0, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); | |||
| VECTOR_LOAD_A(pg_true, 1, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 0, 0); | |||
| } | |||
| VECTOR_STORE(pg_true, 0, 0); | |||
| VECTOR_STORE(pg_true, 1, 0); | |||
| INCR_C_POINTER(0, v_size2); | |||
| } | |||
| for (; i < v_m1; i += v_size) { | |||
| CREATE_A_POINTER(0, 0); | |||
| UPDATE_A_POINTER(v_size); | |||
| BLASLONG k = 0; | |||
| DECLARE_RESULT_VECTOR(0, 0); | |||
| for (; k < K; k++) { | |||
| BROADCAST_LOAD_B(0, 0); | |||
| VECTOR_LOAD_A(pg_true, 0, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); | |||
| } | |||
| VECTOR_STORE(pg_true, 0, 0); | |||
| INCR_C_POINTER(0, v_size); | |||
| } | |||
| for (; i < M; i += v_size) { | |||
| const svbool_t pg_tail = svwhilelt_b32((uint32_t)i, (uint32_t)(M)); | |||
| CREATE_A_POINTER(0, 0); | |||
| UPDATE_A_POINTER(0); | |||
| BLASLONG k = 0; | |||
| DECLARE_RESULT_VECTOR(0, 0); | |||
| for (; k < K; k++) { | |||
| BROADCAST_LOAD_B(0, 0); | |||
| VECTOR_LOAD_A(pg_tail, 0, 0); | |||
| UPDATE_RESULT_VECTOR(pg_tail, 0, 0, 0); | |||
| } | |||
| VECTOR_STORE(pg_tail, 0, 0); | |||
| INCR_C_POINTER(0, 0); | |||
| } | |||
| UPDATE_B_POINTER(1); | |||
| RESET_A_POINTER(); | |||
| UPDATE_C_POINTER(1); | |||
| } | |||
| if (pack_b) | |||
| free(packed_b); | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,483 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2024, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |||
| CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE | |||
| GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | |||
| HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | |||
| LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF | |||
| THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #include <arm_neon.h> | |||
| #include <arm_sve.h> | |||
| #if defined(__ARM_NEON_SVE_BRIDGE) && defined(__has_include) && \ | |||
| __has_include(<arm_neon_sve_bridge.h>) | |||
| #include <arm_neon_sve_bridge.h> | |||
| #else | |||
| #define svdup_neonq_f32(fixed_reg) \ | |||
| ({ \ | |||
| svfloat32_t scalable_reg; \ | |||
| asm("mov %0.q, %q1" : "=w"(scalable_reg) : "w"(fixed_reg) :); \ | |||
| scalable_reg; \ | |||
| }) | |||
| #define svdup_neonq_f64(fixed_reg) \ | |||
| ({ \ | |||
| svfloat64_t scalable_reg; \ | |||
| asm("mov %0.q, %q1" : "=w"(scalable_reg) : "w"(fixed_reg) :); \ | |||
| scalable_reg; \ | |||
| }) | |||
| #endif | |||
| #define RESET_A_POINTER() a_offset = A; | |||
| #define CREATE_A_POINTER(m, scale) FLOAT* a_offset##m = a_offset + scale; | |||
| #define UPDATE_A_POINTER(scale) a_offset = a_offset + scale; | |||
| #define A_ELEMENT_K(m, offset_k) *(a_offset##m + (k + offset_k) * lda) | |||
| #define A_ELEMENT(m) A_ELEMENT_K(m, 0) | |||
| #define RESET_B_POINTER() b_offset = B; | |||
| #define CREATE_B_POINTER(n, scale) FLOAT* b_offset##n = b_offset + scale; | |||
| #define UPDATE_B_POINTER(scale) b_offset = b_offset + scale; | |||
| #define B_ELEMENT_K(n, offset_k) *(b_offset##n + (k + offset_k) * ldb) | |||
| #define B_ELEMENT(n) B_ELEMENT_K(n, 0) | |||
| #define CREATE_C_POINTER(n, scale) FLOAT* c_offset##n = c_offset + scale * ldc; | |||
| #define INCR_C_POINTER(m, incr) // c_offset ## m += incr; | |||
| #define UPDATE_C_POINTER(scale) c_offset = c_offset + scale * ldc; | |||
| #define C_ELEMENT(m, n) *(c_offset##n + ((m * v_size) + i)) | |||
| // #undef C_ELEMENT | |||
| // #define C_ELEMENT(m, n) C[(i+(m))+(j+(n))*ldc] | |||
| #define PACK_ELEMENT_K(n, offset_k) packed_b[(k + offset_k) * 4 + n] | |||
| #define PACK_ELEMENT(n) PACK_ELEMENT_K(n, 0) | |||
| // ASIMD | |||
| #define DECLARE_RESULT_VECTOR4(m, n) \ | |||
| float32x4_t result##m##n = vdupq_n_f32(0.0); | |||
| #define DECLARE_RESULT(m, n) float32_t result##m##n = 0.0; | |||
| #define BROADCAST_LOAD_A4(m, offset_k) \ | |||
| float32x4_t a##m##_k##offset_k = vld1q_dup_f32(&A_ELEMENT_K(m, offset_k)); | |||
| #define LOAD_A1(m, offset_k) \ | |||
| float32_t a##m##_k##offset_k = A_ELEMENT_K(m, offset_k); | |||
| #define VECTOR_LOAD_B4(n, offset_k) \ | |||
| float32x4_t b##n##_k##offset_k = vld1q_f32(&B_ELEMENT_K(n, offset_k)); | |||
| #define GATHER_LOAD_B4(n, offset_k) \ | |||
| float32x4_t b##n##_k##offset_k = vdupq_n_f32(B_ELEMENT_K(n, offset_k)); \ | |||
| b##n##_k##offset_k = \ | |||
| vsetq_lane_f32(B_ELEMENT_K(n + 1, offset_k), b##n##_k##offset_k, 1); \ | |||
| b##n##_k##offset_k = \ | |||
| vsetq_lane_f32(B_ELEMENT_K(n + 2, offset_k), b##n##_k##offset_k, 2); \ | |||
| b##n##_k##offset_k = \ | |||
| vsetq_lane_f32(B_ELEMENT_K(n + 3, offset_k), b##n##_k##offset_k, 3); | |||
| #define VECTOR_UNPACK_B4(n, offset_k) \ | |||
| float32x4_t b##n##_k##offset_k = vld1q_f32(&PACK_ELEMENT_K(n, offset_k)); | |||
| #define VECTOR_PACK_B4(n, offset_k) \ | |||
| vst1q_f32(&PACK_ELEMENT_K(n, offset_k), b##n##_k##offset_k); | |||
| #define PACK_B0(n, offset_k) \ | |||
| PACK_ELEMENT_K(n, offset_k) = vget_lane_f32(b##n##_k##offset_k, 0); | |||
| #define UPDATE_RESULT_VECTOR4(m, n, offset_k) \ | |||
| result##m##n = \ | |||
| vfmaq_f32(result##m##n, a##m##_k##offset_k, b##n##_k##offset_k); | |||
| #define UPDATE_RESULT(m, n, offset_k) \ | |||
| result##m##n = result##m##n + a##m##_k##offset_k * b##n##_k##offset_k; | |||
| #ifdef B0 | |||
| #define SCATTER_STORE4(m, n) \ | |||
| result##m##n = vmulq_f32(result##m##n, vdupq_n_f32(alpha)); \ | |||
| C_ELEMENT(m, n + 0) = vgetq_lane_f32(result##m##n, 0); \ | |||
| C_ELEMENT(m, n + 1) = vgetq_lane_f32(result##m##n, 1); \ | |||
| C_ELEMENT(m, n + 2) = vgetq_lane_f32(result##m##n, 2); \ | |||
| C_ELEMENT(m, n + 3) = vgetq_lane_f32(result##m##n, 3); | |||
| #else | |||
| #define SCATTER_STORE4(m, n) \ | |||
| result##m##n = vmulq_f32(result##m##n, vdupq_n_f32(alpha)); \ | |||
| C_ELEMENT(m, n + 0) = \ | |||
| C_ELEMENT(m, n + 0) * beta + vgetq_lane_f32(result##m##n, 0); \ | |||
| C_ELEMENT(m, n + 1) = \ | |||
| C_ELEMENT(m, n + 1) * beta + vgetq_lane_f32(result##m##n, 1); \ | |||
| C_ELEMENT(m, n + 2) = \ | |||
| C_ELEMENT(m, n + 2) * beta + vgetq_lane_f32(result##m##n, 2); \ | |||
| C_ELEMENT(m, n + 3) = \ | |||
| C_ELEMENT(m, n + 3) * beta + vgetq_lane_f32(result##m##n, 3); | |||
| #endif | |||
| // SVE | |||
| #define DECLARE_RESULT_VECTOR(m, n) svfloat32_t result##m##n = svdup_f32(0.0); | |||
| #define BROADCAST_LOAD_A(m, offset_k) \ | |||
| svfloat32_t a##s##m##_k##offset_k = svdup_f32(A_ELEMENT_K(m, offset_k)); | |||
| #define BROADCAST_LOAD_B(n, offset_k) \ | |||
| svfloat32_t b##s##n##_k##offset_k = svdup_f32(B_ELEMENT_K(n, offset_k)); | |||
| #define VECTOR_LOAD_A(pg, m, offset_k) \ | |||
| svfloat32_t a##s##m##_k##offset_k = svld1(pg, &A_ELEMENT_K(m, offset_k)); | |||
| #define QUADWORD_LOAD_B(n, offset_k) \ | |||
| svfloat32_t b##s##n##_k##offset_k = \ | |||
| svld1rq(pg_true, &B_ELEMENT_K(n, offset_k)); | |||
| #define PACK_B(n, offset_k) \ | |||
| svst1(pg_first, &PACK_ELEMENT_K(n, offset_k), b##s##n##_k##offset_k); | |||
| #define VECTOR_PACK_B(n, offset_k) \ | |||
| svst1(pg_true, &PACK_ELEMENT_K(n* v_size, offset_k), b##s##n##_k##offset_k); | |||
| #define QUADWORD_PACK_B(n, offset_k) \ | |||
| svst1(pg_quad, &PACK_ELEMENT_K(n, offset_k), b##s##n##_k##offset_k); | |||
| #define UNPACK_VECTOR_B(n, offset_k) \ | |||
| svfloat32_t b##s##n##_k##offset_k = \ | |||
| svld1(pg_true, &PACK_ELEMENT_K(n * v_size, offset_k)); | |||
| #define UNPACK_BROADCAST_B(n, offset_k) \ | |||
| svfloat32_t b##s##n##_k##offset_k = svdup_f32(PACK_ELEMENT_K(n, offset_k)); | |||
| #define UNPACK_QUADWORD_B(n, offset_k) \ | |||
| svfloat32_t b##s##n##_k##offset_k = \ | |||
| svld1rq(pg_true, &PACK_ELEMENT_K(n, offset_k)); | |||
| #define UPDATE_RESULT_VECTOR(pg, m, n, offset_k) \ | |||
| result##m##n = \ | |||
| svmla_m(pg, result##m##n, a##s##m##_k##offset_k, b##s##n##_k##offset_k); | |||
| #define UPDATE_RESULT_VECTOR_QUADWORD(m, n, outer, lane, offset_k) \ | |||
| result##m##n = svmla_lane( \ | |||
| result##m##n, a##s##m##_k##offset_k, b##s##outer##_k##offset_k, lane); | |||
| #ifdef B0 | |||
| #define VECTOR_STORE(pg, m, n) \ | |||
| result##m##n = svmul_m(pg, result##m##n, alpha_vec); \ | |||
| svst1(pg, &C_ELEMENT(m, n), result##m##n); | |||
| #define SCATTER_STORE(pg, m, n) \ | |||
| result##m##n = svmul_m(pg, result##m##n, alpha_vec); \ | |||
| svst1_scatter_index(pg, &C_ELEMENT(m, n), ldc_vec, result##m##n); | |||
| #else | |||
| #define VECTOR_STORE(pg, m, n) \ | |||
| result##m##n = svmul_m(pg, result##m##n, alpha_vec); \ | |||
| result##m##n = \ | |||
| svmla_m(pg, result##m##n, svld1(pg, &C_ELEMENT(m, n)), beta_vec); \ | |||
| svst1(pg, &C_ELEMENT(m, n), result##m##n); | |||
| #define SCATTER_STORE(pg, m, n) \ | |||
| result##m##n = svmul_m(pg, result##m##n, alpha_vec); \ | |||
| result##m##n = svmla_m(pg, \ | |||
| result##m##n, \ | |||
| svld1_gather_index(pg, &C_ELEMENT(m, n), ldc_vec), \ | |||
| beta_vec); \ | |||
| svst1_scatter_index(pg, &C_ELEMENT(m, n), ldc_vec, result##m##n); | |||
| #endif | |||
| #ifndef LIKELY | |||
| #ifdef __GNUC__ | |||
| #define LIKELY(x) __builtin_expect(!!(x), 1) | |||
| #else | |||
| #define LIKELY(x) (x) | |||
| #endif | |||
| #endif | |||
| #ifdef B0 | |||
| int | |||
| CNAME(BLASLONG M, | |||
| BLASLONG N, | |||
| BLASLONG K, | |||
| IFLOAT* A, | |||
| BLASLONG lda, | |||
| FLOAT alpha, | |||
| IFLOAT* B, | |||
| BLASLONG ldb, | |||
| FLOAT* C, | |||
| BLASLONG ldc) | |||
| #else | |||
| int | |||
| CNAME(BLASLONG M, | |||
| BLASLONG N, | |||
| BLASLONG K, | |||
| IFLOAT* A, | |||
| BLASLONG lda, | |||
| FLOAT alpha, | |||
| IFLOAT* B, | |||
| BLASLONG ldb, | |||
| FLOAT beta, | |||
| FLOAT* C, | |||
| BLASLONG ldc) | |||
| #endif | |||
| { | |||
| const uint64_t v_size = svcntw(); | |||
| const uint64_t v_size2 = v_size * 2; | |||
| const svbool_t pg_true = svptrue_b32(); | |||
| const svbool_t pg_quad = svwhilelt_b32(0, 4); | |||
| const svbool_t pg_first = svwhilelt_b32(0, 1); | |||
| const svfloat32_t alpha_vec = svdup_f32(alpha); | |||
| #ifndef B0 | |||
| const svfloat32_t beta_vec = svdup_f32(beta); | |||
| #endif | |||
| const BLASLONG n4 = N & -4; | |||
| const BLASLONG v_m2 = M & -v_size2; | |||
| const BLASLONG v_m1 = M & -v_size; | |||
| const int pack_b = M >= v_size2 && N >= 8 && K >= 8 ? 1 : 0; | |||
| FLOAT* packed_b = | |||
| (pack_b) ? packed_b = (FLOAT*)malloc(K * 4 * sizeof(FLOAT)) : NULL; | |||
| FLOAT* b_offset = B; | |||
| FLOAT* a_offset = A; | |||
| FLOAT* c_offset = C; | |||
| BLASLONG j = 0; | |||
| for (; j < n4; j += 4) { | |||
| CREATE_C_POINTER(0, 0); | |||
| CREATE_C_POINTER(1, 1); | |||
| CREATE_C_POINTER(2, 2); | |||
| CREATE_C_POINTER(3, 3); | |||
| CREATE_B_POINTER(0, 0); | |||
| CREATE_B_POINTER(1, 1); | |||
| CREATE_B_POINTER(2, 2); | |||
| CREATE_B_POINTER(3, 3); | |||
| BLASLONG i = 0; | |||
| for (; i < v_m2; i += v_size2) { | |||
| CREATE_A_POINTER(0, 0); | |||
| CREATE_A_POINTER(1, v_size); | |||
| UPDATE_A_POINTER(v_size2); | |||
| BLASLONG k = 0; | |||
| DECLARE_RESULT_VECTOR(0, 0); | |||
| DECLARE_RESULT_VECTOR(0, 1); | |||
| DECLARE_RESULT_VECTOR(0, 2); | |||
| DECLARE_RESULT_VECTOR(0, 3); | |||
| DECLARE_RESULT_VECTOR(1, 0); | |||
| DECLARE_RESULT_VECTOR(1, 1); | |||
| DECLARE_RESULT_VECTOR(1, 2); | |||
| DECLARE_RESULT_VECTOR(1, 3); | |||
| if (LIKELY(packed_b != NULL)) { | |||
| if (i == 0) { | |||
| for (; k < K; k++) { | |||
| QUADWORD_LOAD_B(0, 0); | |||
| QUADWORD_PACK_B(0, 0); | |||
| VECTOR_LOAD_A(pg_true, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 0); | |||
| VECTOR_LOAD_A(pg_true, 1, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 0, 2, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 0, 3, 0); | |||
| } | |||
| } else { | |||
| for (; k < K; k++) { | |||
| UNPACK_QUADWORD_B(0, 0); | |||
| VECTOR_LOAD_A(pg_true, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 0); | |||
| VECTOR_LOAD_A(pg_true, 1, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 0, 2, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 0, 3, 0); | |||
| } | |||
| } | |||
| } else { | |||
| for (; k < K; k++) { | |||
| QUADWORD_LOAD_B(0, 0); | |||
| VECTOR_LOAD_A(pg_true, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 0); | |||
| VECTOR_LOAD_A(pg_true, 1, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 0, 2, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 0, 3, 0); | |||
| } | |||
| } | |||
| VECTOR_STORE(pg_true, 0, 0); | |||
| VECTOR_STORE(pg_true, 0, 1); | |||
| VECTOR_STORE(pg_true, 0, 2); | |||
| VECTOR_STORE(pg_true, 0, 3); | |||
| VECTOR_STORE(pg_true, 1, 0); | |||
| VECTOR_STORE(pg_true, 1, 1); | |||
| VECTOR_STORE(pg_true, 1, 2); | |||
| VECTOR_STORE(pg_true, 1, 3); | |||
| INCR_C_POINTER(0, v_size2); | |||
| INCR_C_POINTER(1, v_size2); | |||
| INCR_C_POINTER(2, v_size2); | |||
| INCR_C_POINTER(3, v_size2); | |||
| } | |||
| for (; i < v_m1; i += v_size) { | |||
| CREATE_A_POINTER(0, 0); | |||
| UPDATE_A_POINTER(v_size); | |||
| BLASLONG k = 0; | |||
| DECLARE_RESULT_VECTOR(0, 0); | |||
| DECLARE_RESULT_VECTOR(0, 1); | |||
| DECLARE_RESULT_VECTOR(0, 2); | |||
| DECLARE_RESULT_VECTOR(0, 3); | |||
| if (LIKELY(packed_b != NULL)) { | |||
| for (; k < K; k++) { | |||
| UNPACK_QUADWORD_B(0, 0); | |||
| VECTOR_LOAD_A(pg_true, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 0); | |||
| } | |||
| } else { | |||
| for (; k < K; k++) { | |||
| QUADWORD_LOAD_B(0, 0); | |||
| VECTOR_LOAD_A(pg_true, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 0); | |||
| } | |||
| } | |||
| VECTOR_STORE(pg_true, 0, 0); | |||
| VECTOR_STORE(pg_true, 0, 1); | |||
| VECTOR_STORE(pg_true, 0, 2); | |||
| VECTOR_STORE(pg_true, 0, 3); | |||
| INCR_C_POINTER(0, v_size); | |||
| INCR_C_POINTER(1, v_size); | |||
| INCR_C_POINTER(2, v_size); | |||
| INCR_C_POINTER(3, v_size); | |||
| } | |||
| for (; i < M; i += v_size) { | |||
| const svbool_t pg_tail = svwhilelt_b32((uint32_t)i, (uint32_t)(M)); | |||
| CREATE_A_POINTER(0, 0); | |||
| UPDATE_A_POINTER(0); | |||
| BLASLONG k = 0; | |||
| DECLARE_RESULT_VECTOR(0, 0); | |||
| DECLARE_RESULT_VECTOR(0, 1); | |||
| DECLARE_RESULT_VECTOR(0, 2); | |||
| DECLARE_RESULT_VECTOR(0, 3); | |||
| if (LIKELY(packed_b != NULL)) { | |||
| for (; k < K; k++) { | |||
| UNPACK_QUADWORD_B(0, 0); | |||
| VECTOR_LOAD_A(pg_tail, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 0); | |||
| } | |||
| } else { | |||
| for (; k < K; k++) { | |||
| QUADWORD_LOAD_B(0, 0); | |||
| VECTOR_LOAD_A(pg_tail, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 0); | |||
| } | |||
| } | |||
| VECTOR_STORE(pg_tail, 0, 0); | |||
| VECTOR_STORE(pg_tail, 0, 1); | |||
| VECTOR_STORE(pg_tail, 0, 2); | |||
| VECTOR_STORE(pg_tail, 0, 3); | |||
| INCR_C_POINTER(0, 0); | |||
| INCR_C_POINTER(1, 0); | |||
| INCR_C_POINTER(2, 0); | |||
| INCR_C_POINTER(3, 0); | |||
| } | |||
| UPDATE_B_POINTER(4); | |||
| RESET_A_POINTER(); | |||
| UPDATE_C_POINTER(4); | |||
| } | |||
| for (; j < N; j++) { | |||
| CREATE_C_POINTER(0, 0); | |||
| CREATE_B_POINTER(0, 0); | |||
| BLASLONG i = 0; | |||
| for (; i < v_m2; i += v_size2) { | |||
| CREATE_A_POINTER(0, 0); | |||
| CREATE_A_POINTER(1, v_size); | |||
| UPDATE_A_POINTER(v_size2); | |||
| BLASLONG k = 0; | |||
| DECLARE_RESULT_VECTOR(0, 0); | |||
| DECLARE_RESULT_VECTOR(1, 0); | |||
| for (; k < K; k++) { | |||
| BROADCAST_LOAD_B(0, 0); | |||
| VECTOR_LOAD_A(pg_true, 0, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); | |||
| VECTOR_LOAD_A(pg_true, 1, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 0, 0); | |||
| } | |||
| VECTOR_STORE(pg_true, 0, 0); | |||
| VECTOR_STORE(pg_true, 1, 0); | |||
| INCR_C_POINTER(0, v_size2); | |||
| } | |||
| for (; i < v_m1; i += v_size) { | |||
| CREATE_A_POINTER(0, 0); | |||
| UPDATE_A_POINTER(v_size); | |||
| BLASLONG k = 0; | |||
| DECLARE_RESULT_VECTOR(0, 0); | |||
| for (; k < K; k++) { | |||
| BROADCAST_LOAD_B(0, 0); | |||
| VECTOR_LOAD_A(pg_true, 0, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); | |||
| } | |||
| VECTOR_STORE(pg_true, 0, 0); | |||
| INCR_C_POINTER(0, v_size); | |||
| } | |||
| for (; i < M; i += v_size) { | |||
| const svbool_t pg_tail = svwhilelt_b32((uint32_t)i, (uint32_t)(M)); | |||
| CREATE_A_POINTER(0, 0); | |||
| UPDATE_A_POINTER(0); | |||
| BLASLONG k = 0; | |||
| DECLARE_RESULT_VECTOR(0, 0); | |||
| for (; k < K; k++) { | |||
| BROADCAST_LOAD_B(0, 0); | |||
| VECTOR_LOAD_A(pg_tail, 0, 0); | |||
| UPDATE_RESULT_VECTOR(pg_tail, 0, 0, 0); | |||
| } | |||
| VECTOR_STORE(pg_tail, 0, 0); | |||
| INCR_C_POINTER(0, 0); | |||
| } | |||
| UPDATE_B_POINTER(1); | |||
| RESET_A_POINTER(); | |||
| UPDATE_C_POINTER(1); | |||
| } | |||
| if (pack_b) | |||
| free(packed_b); | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,719 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2024, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |||
| CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE | |||
| GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | |||
| HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | |||
| LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF | |||
| THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #include <arm_neon.h> | |||
| #include <arm_sve.h> | |||
| #if defined(__ARM_NEON_SVE_BRIDGE) && defined(__has_include) && \ | |||
| __has_include(<arm_neon_sve_bridge.h>) | |||
| #include <arm_neon_sve_bridge.h> | |||
| #else | |||
| #define svdup_neonq_f32(fixed_reg) \ | |||
| ({ \ | |||
| svfloat32_t scalable_reg; \ | |||
| asm("mov %0.q, %q1" : "=w"(scalable_reg) : "w"(fixed_reg) :); \ | |||
| scalable_reg; \ | |||
| }) | |||
| #define svdup_neonq_f64(fixed_reg) \ | |||
| ({ \ | |||
| svfloat64_t scalable_reg; \ | |||
| asm("mov %0.q, %q1" : "=w"(scalable_reg) : "w"(fixed_reg) :); \ | |||
| scalable_reg; \ | |||
| }) | |||
| #endif | |||
| #define RESET_A_POINTER() a_offset = A; | |||
| #define CREATE_A_POINTER(m, scale) FLOAT* a_offset##m = a_offset + scale * lda; | |||
| #define UPDATE_A_POINTER(scale) a_offset = a_offset + scale * lda; | |||
| #define A_ELEMENT_K(m, offset_k) *(a_offset##m + (k + offset_k)) | |||
| #define A_ELEMENT(m) A_ELEMENT_K(m, 0) | |||
| #define RESET_B_POINTER() b_offset = B; | |||
| #define CREATE_B_POINTER(n, scale) FLOAT* b_offset##n = b_offset + scale * ldb; | |||
| #define UPDATE_B_POINTER(scale) b_offset = b_offset + scale * ldb; | |||
| #define B_ELEMENT_K(n, offset_k) *(b_offset##n + (k + offset_k)) | |||
| #define B_ELEMENT(n) B_ELEMENT_K(n, 0) | |||
| #define CREATE_C_POINTER(m, scale) FLOAT* c_offset##m = c_offset + scale; | |||
| #define INCR_C_POINTER(m, incr) // c_offset ## m += incr * ldc; | |||
| #define UPDATE_C_POINTER(scale) c_offset += scale; | |||
| #define C_ELEMENT(m, n) \ | |||
| *(c_offset##m + ((j + n) * ldc)) // C[(i+(m))+(j+(n))*ldc] | |||
| // #undef C_ELEMENT | |||
| // #define C_ELEMENT(m, n) C[(i+(m))+(j+(n))*ldc] | |||
| #define PACK_ELEMENT_K(m, offset_k) packed_a[(k + offset_k) * v_size2 + m] | |||
| #define PACK_ELEMENT(m) PACK_ELEMENT_K(m, 0) | |||
| // ASIMD | |||
| #define DECLARE_RESULT_VECTOR4(m, n) \ | |||
| float32x4_t result##m##n = vdupq_n_f32(0.0); | |||
| #define DECLARE_RESULT(m, n) float32_t result##m##n = 0.0; | |||
| #define BROADCAST_LOAD_A4(m, offset_k) \ | |||
| float32x4_t a##m##_k##offset_k = vld1q_dup_f32(&A_ELEMENT_K(m, offset_k)); | |||
| #define LOAD_A1(m, offset_k) \ | |||
| float32_t a##m##_k##offset_k = A_ELEMENT_K(m, offset_k); | |||
| #define GATHER_LOAD_B4(n, offset_k) \ | |||
| float32x4_t b##n##_k##offset_k = vdupq_n_f32(B_ELEMENT_K(n, offset_k)); \ | |||
| b##n##_k##offset_k = \ | |||
| vsetq_lane_f32(B_ELEMENT_K(n + 1, offset_k), b##n##_k##offset_k, 1); \ | |||
| b##n##_k##offset_k = \ | |||
| vsetq_lane_f32(B_ELEMENT_K(n + 2, offset_k), b##n##_k##offset_k, 2); \ | |||
| b##n##_k##offset_k = \ | |||
| vsetq_lane_f32(B_ELEMENT_K(n + 3, offset_k), b##n##_k##offset_k, 3); | |||
| #define VECTOR_UNPACK_B4(n, offset_k) \ | |||
| float32x4_t b##n##_k##offset_k = vld1q_f32(&PACK_ELEMENT_K(n, offset_k)); | |||
| #define PACK_B0(n, offset_k) \ | |||
| PACK_ELEMENT_K(n, offset_k) = vget_lane_f32(b##n##_k##offset_k, 0); | |||
| #define UPDATE_RESULT_VECTOR4(m, n, offset_k) \ | |||
| result##m##n = \ | |||
| vfmaq_f32(result##m##n, a##m##_k##offset_k, b##n##_k##offset_k); | |||
| #define UPDATE_RESULT(m, n, offset_k) \ | |||
| result##m##n = result##m##n + a##m##_k##offset_k * b##n##_k##offset_k; | |||
| #ifdef B0 | |||
| #define SCATTER_STORE4(m, n) \ | |||
| result##m##n = vmulq_f32(result##m##n, vdupq_n_f32(alpha)); \ | |||
| C_ELEMENT(m, n + 0) = vgetq_lane_f32(result##m##n, 0); \ | |||
| C_ELEMENT(m, n + 1) = vgetq_lane_f32(result##m##n, 1); \ | |||
| C_ELEMENT(m, n + 2) = vgetq_lane_f32(result##m##n, 2); \ | |||
| C_ELEMENT(m, n + 3) = vgetq_lane_f32(result##m##n, 3); | |||
| #else | |||
| #define SCATTER_STORE4(m, n) \ | |||
| result##m##n = vmulq_f32(result##m##n, vdupq_n_f32(alpha)); \ | |||
| C_ELEMENT(m, n + 0) = \ | |||
| C_ELEMENT(m, n + 0) * beta + vgetq_lane_f32(result##m##n, 0); \ | |||
| C_ELEMENT(m, n + 1) = \ | |||
| C_ELEMENT(m, n + 1) * beta + vgetq_lane_f32(result##m##n, 1); \ | |||
| C_ELEMENT(m, n + 2) = \ | |||
| C_ELEMENT(m, n + 2) * beta + vgetq_lane_f32(result##m##n, 2); \ | |||
| C_ELEMENT(m, n + 3) = \ | |||
| C_ELEMENT(m, n + 3) * beta + vgetq_lane_f32(result##m##n, 3); | |||
| #endif | |||
| // SVE | |||
| #define DECLARE_RESULT_VECTOR(m, n) svfloat32_t result##m##n = svdup_f32(0.0); | |||
| #define BROADCAST_LOAD_A(m, offset_k) \ | |||
| svfloat32_t a##s##m##_k##offset_k = svdup_f32(A_ELEMENT_K(m, offset_k)); | |||
| #define BROADCAST_LOAD_B(n, offset_k) \ | |||
| svfloat32_t b##s##n##_k##offset_k = svdup_f32(B_ELEMENT_K(n, offset_k)); | |||
| #define VECTOR_LOAD_A(pg, m, offset_k) \ | |||
| svfloat32_t a##s##m##_k##offset_k = svld1(pg, &A_ELEMENT_K(m, offset_k)); | |||
| #define GATHER_LOAD_A(pg, m, offset_k) \ | |||
| svfloat32_t a##s##m##_k##offset_k = \ | |||
| svld1_gather_index(pg, &A_ELEMENT_K(m, offset_k), lda_vec); | |||
| #define PACK_A(m, offset_k) \ | |||
| svst1(pg_first, &PACK_ELEMENT_K(m, offset_k), a##s##m##_k##offset_k); | |||
| #define VECTOR_PACK_A(m, offset_k) \ | |||
| svst1(pg_true, &PACK_ELEMENT_K(m* v_size, offset_k), a##s##m##_k##offset_k); | |||
| #define QUADWORD_PACK_A(m, offset_k) \ | |||
| svst1(pg_quad, &PACK_ELEMENT_K(m, offset_k), a##s##m##_k##offset_k); | |||
| #define UNPACK_VECTOR_A(m, offset_k) \ | |||
| svfloat32_t a##s##m##_k##offset_k = \ | |||
| svld1(pg_true, &PACK_ELEMENT_K(m * v_size, offset_k)); | |||
| #define UNPACK_BROADCAST_A(m, offset_k) \ | |||
| svfloat32_t a##s##m##_k##offset_k = svdup_f32(PACK_ELEMENT_K(m, offset_k)); | |||
| #define UNPACK_QUADWORD_A(m, offset_k) \ | |||
| svfloat32_t a##s##m##_k##offset_k = \ | |||
| svld1rq(pg_true, &PACK_ELEMENT_K(m, offset_k)); | |||
| #define UPDATE_RESULT_VECTOR(pg, m, n, offset_k) \ | |||
| result##m##n = \ | |||
| svmla_m(pg, result##m##n, a##s##m##_k##offset_k, b##s##n##_k##offset_k); | |||
| #define UPDATE_RESULT_VECTOR_QUADWORD(m, n, outer, lane, offset_k) \ | |||
| result##m##n = svmla_lane( \ | |||
| result##m##n, a##s##m##_k##offset_k, b##s##outer##_k##offset_k, lane); | |||
| #ifdef B0 | |||
| #define VECTOR_STORE(pg, m, n) \ | |||
| result##m##n = svmul_m(pg, result##m##n, alpha_vec); \ | |||
| svst1(pg, &C_ELEMENT(m, n), result##m##n); | |||
| #define SCATTER_STORE(pg, m, n) \ | |||
| result##m##n = svmul_m(pg, result##m##n, alpha_vec); \ | |||
| svst1_scatter_index(pg, &C_ELEMENT(m, n), ldc_vec, result##m##n); | |||
| #else | |||
| #define VECTOR_STORE(pg, m, n) \ | |||
| result##m##n = svmul_m(pg, result##m##n, alpha_vec); \ | |||
| result##m##n = \ | |||
| svmla_m(pg, result##m##n, svld1(pg, &C_ELEMENT(m, n)), beta_vec); \ | |||
| svst1(pg, &C_ELEMENT(m, n), result##m##n); | |||
| #define SCATTER_STORE(pg, m, n) \ | |||
| result##m##n = svmul_m(pg, result##m##n, alpha_vec); \ | |||
| result##m##n = svmla_m(pg, \ | |||
| result##m##n, \ | |||
| svld1_gather_index(pg, &C_ELEMENT(m, n), ldc_vec), \ | |||
| beta_vec); \ | |||
| svst1_scatter_index(pg, &C_ELEMENT(m, n), ldc_vec, result##m##n); | |||
| #endif | |||
| #ifndef LIKELY | |||
| #ifdef __GNUC__ | |||
| #define LIKELY(x) __builtin_expect(!!(x), 1) | |||
| #else | |||
| #define LIKELY(x) (x) | |||
| #endif | |||
| #endif | |||
| #ifdef B0 | |||
| int | |||
| CNAME(BLASLONG M, | |||
| BLASLONG N, | |||
| BLASLONG K, | |||
| IFLOAT* A, | |||
| BLASLONG lda, | |||
| FLOAT alpha, | |||
| IFLOAT* B, | |||
| BLASLONG ldb, | |||
| FLOAT* C, | |||
| BLASLONG ldc) | |||
| #else | |||
| int | |||
| CNAME(BLASLONG M, | |||
| BLASLONG N, | |||
| BLASLONG K, | |||
| IFLOAT* A, | |||
| BLASLONG lda, | |||
| FLOAT alpha, | |||
| IFLOAT* B, | |||
| BLASLONG ldb, | |||
| FLOAT beta, | |||
| FLOAT* C, | |||
| BLASLONG ldc) | |||
| #endif | |||
| { | |||
| const uint64_t v_size = svcntw(); | |||
| const uint64_t v_size2 = v_size * 2; | |||
| const svbool_t pg_true = svptrue_b32(); | |||
| const svbool_t pg_quad = svwhilelt_b32(0, 4); | |||
| const svbool_t pg_first = svwhilelt_b32(0, 1); | |||
| const svfloat32_t alpha_vec = svdup_f32(alpha); | |||
| #ifndef B0 | |||
| const svfloat32_t beta_vec = svdup_f32(beta); | |||
| #endif | |||
| const svuint32_t lda_vec = svindex_u32(0LL, lda); | |||
| const BLASLONG v_m2 = M & -v_size2; | |||
| const BLASLONG v_m1 = M & -v_size; | |||
| const BLASLONG n8 = N & -8; | |||
| const BLASLONG n4 = N & -4; | |||
| const int pack_a = M >= v_size2 && N >= 8 && K >= 8 ? 1 : 0; | |||
| FLOAT* packed_a = | |||
| (pack_a) ? packed_a = (FLOAT*)malloc(K * v_size2 * sizeof(FLOAT)) : NULL; | |||
| FLOAT* a_offset = A; | |||
| FLOAT* b_offset = B; | |||
| FLOAT* c_offset = C; | |||
| BLASLONG i = 0; | |||
| for (; i < v_m2; i += v_size2) { | |||
| CREATE_C_POINTER(0, 0); | |||
| CREATE_C_POINTER(1, v_size); | |||
| CREATE_A_POINTER(0, 0); | |||
| CREATE_A_POINTER(1, v_size); | |||
| BLASLONG j = 0; | |||
| for (; j < n8; j += 8) { | |||
| CREATE_B_POINTER(0, 0); | |||
| CREATE_B_POINTER(1, 1); | |||
| CREATE_B_POINTER(2, 2); | |||
| CREATE_B_POINTER(3, 3); | |||
| CREATE_B_POINTER(4, 4); | |||
| CREATE_B_POINTER(5, 5); | |||
| CREATE_B_POINTER(6, 6); | |||
| CREATE_B_POINTER(7, 7); | |||
| UPDATE_B_POINTER(8); | |||
| BLASLONG k = 0; | |||
| DECLARE_RESULT_VECTOR(0, 0); | |||
| DECLARE_RESULT_VECTOR(0, 1); | |||
| DECLARE_RESULT_VECTOR(0, 2); | |||
| DECLARE_RESULT_VECTOR(0, 3); | |||
| DECLARE_RESULT_VECTOR(0, 4); | |||
| DECLARE_RESULT_VECTOR(0, 5); | |||
| DECLARE_RESULT_VECTOR(0, 6); | |||
| DECLARE_RESULT_VECTOR(0, 7); | |||
| DECLARE_RESULT_VECTOR(1, 0); | |||
| DECLARE_RESULT_VECTOR(1, 1); | |||
| DECLARE_RESULT_VECTOR(1, 2); | |||
| DECLARE_RESULT_VECTOR(1, 3); | |||
| DECLARE_RESULT_VECTOR(1, 4); | |||
| DECLARE_RESULT_VECTOR(1, 5); | |||
| DECLARE_RESULT_VECTOR(1, 6); | |||
| DECLARE_RESULT_VECTOR(1, 7); | |||
| if (LIKELY(packed_a != NULL)) { | |||
| if (j == 0) { | |||
| for (; k < K; k++) { | |||
| BROADCAST_LOAD_B(0, 0); | |||
| GATHER_LOAD_A(pg_true, 0, 0); | |||
| VECTOR_PACK_A(0, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); | |||
| BROADCAST_LOAD_B(1, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 1, 0); | |||
| GATHER_LOAD_A(pg_true, 1, 0); | |||
| VECTOR_PACK_A(1, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 0, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 1, 0); | |||
| BROADCAST_LOAD_B(2, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 2, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 2, 0); | |||
| BROADCAST_LOAD_B(3, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 3, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 3, 0); | |||
| BROADCAST_LOAD_B(4, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 4, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 4, 0); | |||
| BROADCAST_LOAD_B(5, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 5, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 5, 0); | |||
| BROADCAST_LOAD_B(6, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 6, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 6, 0); | |||
| BROADCAST_LOAD_B(7, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 7, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 7, 0); | |||
| } | |||
| } else { | |||
| for (; k < K; k++) { | |||
| BROADCAST_LOAD_B(0, 0); | |||
| UNPACK_VECTOR_A(0, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); | |||
| BROADCAST_LOAD_B(1, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 1, 0); | |||
| UNPACK_VECTOR_A(1, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 0, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 1, 0); | |||
| BROADCAST_LOAD_B(2, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 2, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 2, 0); | |||
| BROADCAST_LOAD_B(3, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 3, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 3, 0); | |||
| BROADCAST_LOAD_B(4, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 4, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 4, 0); | |||
| BROADCAST_LOAD_B(5, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 5, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 5, 0); | |||
| BROADCAST_LOAD_B(6, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 6, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 6, 0); | |||
| BROADCAST_LOAD_B(7, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 7, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 7, 0); | |||
| } | |||
| } | |||
| } else { | |||
| for (; k < K; k++) { | |||
| BROADCAST_LOAD_B(0, 0); | |||
| GATHER_LOAD_A(pg_true, 0, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); | |||
| BROADCAST_LOAD_B(1, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 1, 0); | |||
| GATHER_LOAD_A(pg_true, 1, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 0, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 1, 0); | |||
| BROADCAST_LOAD_B(2, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 2, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 2, 0); | |||
| BROADCAST_LOAD_B(3, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 3, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 3, 0); | |||
| BROADCAST_LOAD_B(4, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 4, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 4, 0); | |||
| BROADCAST_LOAD_B(5, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 5, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 5, 0); | |||
| BROADCAST_LOAD_B(6, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 6, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 6, 0); | |||
| BROADCAST_LOAD_B(7, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 7, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 7, 0); | |||
| } | |||
| } | |||
| VECTOR_STORE(pg_true, 0, 0); | |||
| VECTOR_STORE(pg_true, 0, 1); | |||
| VECTOR_STORE(pg_true, 0, 2); | |||
| VECTOR_STORE(pg_true, 0, 3); | |||
| VECTOR_STORE(pg_true, 0, 4); | |||
| VECTOR_STORE(pg_true, 0, 5); | |||
| VECTOR_STORE(pg_true, 0, 6); | |||
| VECTOR_STORE(pg_true, 0, 7); | |||
| VECTOR_STORE(pg_true, 1, 0); | |||
| VECTOR_STORE(pg_true, 1, 1); | |||
| VECTOR_STORE(pg_true, 1, 2); | |||
| VECTOR_STORE(pg_true, 1, 3); | |||
| VECTOR_STORE(pg_true, 1, 4); | |||
| VECTOR_STORE(pg_true, 1, 5); | |||
| VECTOR_STORE(pg_true, 1, 6); | |||
| VECTOR_STORE(pg_true, 1, 7); | |||
| INCR_C_POINTER(0, 8); | |||
| INCR_C_POINTER(1, 8); | |||
| } | |||
| for (; j < n4; j += 4) { | |||
| CREATE_B_POINTER(0, 0); | |||
| CREATE_B_POINTER(1, 1); | |||
| CREATE_B_POINTER(2, 2); | |||
| CREATE_B_POINTER(3, 3); | |||
| UPDATE_B_POINTER(4); | |||
| BLASLONG k = 0; | |||
| DECLARE_RESULT_VECTOR(0, 0); | |||
| DECLARE_RESULT_VECTOR(0, 1); | |||
| DECLARE_RESULT_VECTOR(0, 2); | |||
| DECLARE_RESULT_VECTOR(0, 3); | |||
| DECLARE_RESULT_VECTOR(1, 0); | |||
| DECLARE_RESULT_VECTOR(1, 1); | |||
| DECLARE_RESULT_VECTOR(1, 2); | |||
| DECLARE_RESULT_VECTOR(1, 3); | |||
| if (LIKELY(packed_a != NULL)) { | |||
| for (; k < K; k++) { | |||
| BROADCAST_LOAD_B(0, 0); | |||
| UNPACK_VECTOR_A(0, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); | |||
| BROADCAST_LOAD_B(1, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 1, 0); | |||
| UNPACK_VECTOR_A(1, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 0, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 1, 0); | |||
| BROADCAST_LOAD_B(2, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 2, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 2, 0); | |||
| BROADCAST_LOAD_B(3, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 3, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 3, 0); | |||
| } | |||
| } else { | |||
| for (; k < K; k++) { | |||
| BROADCAST_LOAD_B(0, 0); | |||
| GATHER_LOAD_A(pg_true, 0, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); | |||
| BROADCAST_LOAD_B(1, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 1, 0); | |||
| GATHER_LOAD_A(pg_true, 1, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 0, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 1, 0); | |||
| BROADCAST_LOAD_B(2, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 2, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 2, 0); | |||
| BROADCAST_LOAD_B(3, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 3, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 3, 0); | |||
| } | |||
| } | |||
| VECTOR_STORE(pg_true, 0, 0); | |||
| VECTOR_STORE(pg_true, 0, 1); | |||
| VECTOR_STORE(pg_true, 0, 2); | |||
| VECTOR_STORE(pg_true, 0, 3); | |||
| VECTOR_STORE(pg_true, 1, 0); | |||
| VECTOR_STORE(pg_true, 1, 1); | |||
| VECTOR_STORE(pg_true, 1, 2); | |||
| VECTOR_STORE(pg_true, 1, 3); | |||
| INCR_C_POINTER(0, 4); | |||
| INCR_C_POINTER(1, 4); | |||
| } | |||
| for (; j < N; j++) { | |||
| CREATE_B_POINTER(0, 0); | |||
| UPDATE_B_POINTER(1); | |||
| BLASLONG k = 0; | |||
| DECLARE_RESULT_VECTOR(0, 0); | |||
| DECLARE_RESULT_VECTOR(1, 0); | |||
| if (LIKELY(packed_a != NULL)) { | |||
| for (; k < K; k++) { | |||
| BROADCAST_LOAD_B(0, 0); | |||
| UNPACK_VECTOR_A(0, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); | |||
| UNPACK_VECTOR_A(1, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 0, 0); | |||
| } | |||
| } else { | |||
| for (; k < K; k++) { | |||
| BROADCAST_LOAD_B(0, 0); | |||
| GATHER_LOAD_A(pg_true, 0, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); | |||
| GATHER_LOAD_A(pg_true, 1, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 0, 0); | |||
| } | |||
| } | |||
| VECTOR_STORE(pg_true, 0, 0); | |||
| VECTOR_STORE(pg_true, 1, 0); | |||
| INCR_C_POINTER(0, 1); | |||
| INCR_C_POINTER(1, 1); | |||
| } | |||
| UPDATE_A_POINTER(v_size2); | |||
| RESET_B_POINTER(); | |||
| UPDATE_C_POINTER(v_size2); | |||
| } | |||
| for (; i < v_m1; i += v_size) { | |||
| CREATE_C_POINTER(0, 0); | |||
| CREATE_A_POINTER(0, 0); | |||
| BLASLONG j = 0; | |||
| for (; j < n8; j += 8) { | |||
| CREATE_B_POINTER(0, 0); | |||
| CREATE_B_POINTER(1, 1); | |||
| CREATE_B_POINTER(2, 2); | |||
| CREATE_B_POINTER(3, 3); | |||
| CREATE_B_POINTER(4, 4); | |||
| CREATE_B_POINTER(5, 5); | |||
| CREATE_B_POINTER(6, 6); | |||
| CREATE_B_POINTER(7, 7); | |||
| UPDATE_B_POINTER(8); | |||
| BLASLONG k = 0; | |||
| DECLARE_RESULT_VECTOR(0, 0); | |||
| DECLARE_RESULT_VECTOR(0, 1); | |||
| DECLARE_RESULT_VECTOR(0, 2); | |||
| DECLARE_RESULT_VECTOR(0, 3); | |||
| DECLARE_RESULT_VECTOR(0, 4); | |||
| DECLARE_RESULT_VECTOR(0, 5); | |||
| DECLARE_RESULT_VECTOR(0, 6); | |||
| DECLARE_RESULT_VECTOR(0, 7); | |||
| for (; k < K; k++) { | |||
| BROADCAST_LOAD_B(0, 0); | |||
| GATHER_LOAD_A(pg_true, 0, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); | |||
| BROADCAST_LOAD_B(1, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 1, 0); | |||
| BROADCAST_LOAD_B(2, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 2, 0); | |||
| BROADCAST_LOAD_B(3, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 3, 0); | |||
| BROADCAST_LOAD_B(4, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 4, 0); | |||
| BROADCAST_LOAD_B(5, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 5, 0); | |||
| BROADCAST_LOAD_B(6, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 6, 0); | |||
| BROADCAST_LOAD_B(7, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 7, 0); | |||
| } | |||
| VECTOR_STORE(pg_true, 0, 0); | |||
| VECTOR_STORE(pg_true, 0, 1); | |||
| VECTOR_STORE(pg_true, 0, 2); | |||
| VECTOR_STORE(pg_true, 0, 3); | |||
| VECTOR_STORE(pg_true, 0, 4); | |||
| VECTOR_STORE(pg_true, 0, 5); | |||
| VECTOR_STORE(pg_true, 0, 6); | |||
| VECTOR_STORE(pg_true, 0, 7); | |||
| INCR_C_POINTER(0, 8); | |||
| } | |||
| for (; j < n4; j += 4) { | |||
| CREATE_B_POINTER(0, 0); | |||
| CREATE_B_POINTER(1, 1); | |||
| CREATE_B_POINTER(2, 2); | |||
| CREATE_B_POINTER(3, 3); | |||
| UPDATE_B_POINTER(4); | |||
| BLASLONG k = 0; | |||
| DECLARE_RESULT_VECTOR(0, 0); | |||
| DECLARE_RESULT_VECTOR(0, 1); | |||
| DECLARE_RESULT_VECTOR(0, 2); | |||
| DECLARE_RESULT_VECTOR(0, 3); | |||
| for (; k < K; k++) { | |||
| BROADCAST_LOAD_B(0, 0); | |||
| GATHER_LOAD_A(pg_true, 0, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); | |||
| BROADCAST_LOAD_B(1, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 1, 0); | |||
| BROADCAST_LOAD_B(2, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 2, 0); | |||
| BROADCAST_LOAD_B(3, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 3, 0); | |||
| } | |||
| VECTOR_STORE(pg_true, 0, 0); | |||
| VECTOR_STORE(pg_true, 0, 1); | |||
| VECTOR_STORE(pg_true, 0, 2); | |||
| VECTOR_STORE(pg_true, 0, 3); | |||
| INCR_C_POINTER(0, 4); | |||
| } | |||
| for (; j < N; j++) { | |||
| CREATE_B_POINTER(0, 0); | |||
| UPDATE_B_POINTER(1); | |||
| BLASLONG k = 0; | |||
| DECLARE_RESULT_VECTOR(0, 0); | |||
| for (; k < K; k++) { | |||
| BROADCAST_LOAD_B(0, 0); | |||
| GATHER_LOAD_A(pg_true, 0, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); | |||
| } | |||
| VECTOR_STORE(pg_true, 0, 0); | |||
| INCR_C_POINTER(0, 1); | |||
| } | |||
| UPDATE_A_POINTER(v_size); | |||
| RESET_B_POINTER(); | |||
| UPDATE_C_POINTER(v_size); | |||
| } | |||
| for (; i < M; i += v_size) { | |||
| const svbool_t pg_tail = svwhilelt_b32((uint32_t)i, (uint32_t)(M)); | |||
| CREATE_C_POINTER(0, 0); | |||
| CREATE_A_POINTER(0, 0); | |||
| BLASLONG j = 0; | |||
| for (; j < n8; j += 8) { | |||
| CREATE_B_POINTER(0, 0); | |||
| CREATE_B_POINTER(1, 1); | |||
| CREATE_B_POINTER(2, 2); | |||
| CREATE_B_POINTER(3, 3); | |||
| CREATE_B_POINTER(4, 4); | |||
| CREATE_B_POINTER(5, 5); | |||
| CREATE_B_POINTER(6, 6); | |||
| CREATE_B_POINTER(7, 7); | |||
| UPDATE_B_POINTER(8); | |||
| BLASLONG k = 0; | |||
| DECLARE_RESULT_VECTOR(0, 0); | |||
| DECLARE_RESULT_VECTOR(0, 1); | |||
| DECLARE_RESULT_VECTOR(0, 2); | |||
| DECLARE_RESULT_VECTOR(0, 3); | |||
| DECLARE_RESULT_VECTOR(0, 4); | |||
| DECLARE_RESULT_VECTOR(0, 5); | |||
| DECLARE_RESULT_VECTOR(0, 6); | |||
| DECLARE_RESULT_VECTOR(0, 7); | |||
| for (; k < K; k++) { | |||
| BROADCAST_LOAD_B(0, 0); | |||
| GATHER_LOAD_A(pg_tail, 0, 0); | |||
| UPDATE_RESULT_VECTOR(pg_tail, 0, 0, 0); | |||
| BROADCAST_LOAD_B(1, 0); | |||
| UPDATE_RESULT_VECTOR(pg_tail, 0, 1, 0); | |||
| BROADCAST_LOAD_B(2, 0); | |||
| UPDATE_RESULT_VECTOR(pg_tail, 0, 2, 0); | |||
| BROADCAST_LOAD_B(3, 0); | |||
| UPDATE_RESULT_VECTOR(pg_tail, 0, 3, 0); | |||
| BROADCAST_LOAD_B(4, 0); | |||
| UPDATE_RESULT_VECTOR(pg_tail, 0, 4, 0); | |||
| BROADCAST_LOAD_B(5, 0); | |||
| UPDATE_RESULT_VECTOR(pg_tail, 0, 5, 0); | |||
| BROADCAST_LOAD_B(6, 0); | |||
| UPDATE_RESULT_VECTOR(pg_tail, 0, 6, 0); | |||
| BROADCAST_LOAD_B(7, 0); | |||
| UPDATE_RESULT_VECTOR(pg_tail, 0, 7, 0); | |||
| } | |||
| VECTOR_STORE(pg_tail, 0, 0); | |||
| VECTOR_STORE(pg_tail, 0, 1); | |||
| VECTOR_STORE(pg_tail, 0, 2); | |||
| VECTOR_STORE(pg_tail, 0, 3); | |||
| VECTOR_STORE(pg_tail, 0, 4); | |||
| VECTOR_STORE(pg_tail, 0, 5); | |||
| VECTOR_STORE(pg_tail, 0, 6); | |||
| VECTOR_STORE(pg_tail, 0, 7); | |||
| INCR_C_POINTER(0, 8); | |||
| } | |||
| for (; j < n4; j += 4) { | |||
| CREATE_B_POINTER(0, 0); | |||
| CREATE_B_POINTER(1, 1); | |||
| CREATE_B_POINTER(2, 2); | |||
| CREATE_B_POINTER(3, 3); | |||
| UPDATE_B_POINTER(4); | |||
| BLASLONG k = 0; | |||
| DECLARE_RESULT_VECTOR(0, 0); | |||
| DECLARE_RESULT_VECTOR(0, 1); | |||
| DECLARE_RESULT_VECTOR(0, 2); | |||
| DECLARE_RESULT_VECTOR(0, 3); | |||
| for (; k < K; k++) { | |||
| BROADCAST_LOAD_B(0, 0); | |||
| GATHER_LOAD_A(pg_tail, 0, 0); | |||
| UPDATE_RESULT_VECTOR(pg_tail, 0, 0, 0); | |||
| BROADCAST_LOAD_B(1, 0); | |||
| UPDATE_RESULT_VECTOR(pg_tail, 0, 1, 0); | |||
| BROADCAST_LOAD_B(2, 0); | |||
| UPDATE_RESULT_VECTOR(pg_tail, 0, 2, 0); | |||
| BROADCAST_LOAD_B(3, 0); | |||
| UPDATE_RESULT_VECTOR(pg_tail, 0, 3, 0); | |||
| } | |||
| VECTOR_STORE(pg_tail, 0, 0); | |||
| VECTOR_STORE(pg_tail, 0, 1); | |||
| VECTOR_STORE(pg_tail, 0, 2); | |||
| VECTOR_STORE(pg_tail, 0, 3); | |||
| INCR_C_POINTER(0, 4); | |||
| } | |||
| for (; j < N; j++) { | |||
| CREATE_B_POINTER(0, 0); | |||
| UPDATE_B_POINTER(1); | |||
| BLASLONG k = 0; | |||
| DECLARE_RESULT_VECTOR(0, 0); | |||
| for (; k < K; k++) { | |||
| BROADCAST_LOAD_B(0, 0); | |||
| GATHER_LOAD_A(pg_tail, 0, 0); | |||
| UPDATE_RESULT_VECTOR(pg_tail, 0, 0, 0); | |||
| } | |||
| VECTOR_STORE(pg_tail, 0, 0); | |||
| INCR_C_POINTER(0, 1); | |||
| } | |||
| UPDATE_A_POINTER(0); | |||
| RESET_B_POINTER(); | |||
| UPDATE_C_POINTER(0); | |||
| } | |||
| if (pack_a) | |||
| free(packed_a); | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,678 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2024, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |||
| CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE | |||
| GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | |||
| HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | |||
| LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF | |||
| THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #include <arm_neon.h> | |||
| #include <arm_sve.h> | |||
| #if defined(__ARM_NEON_SVE_BRIDGE) && defined(__has_include) && \ | |||
| __has_include(<arm_neon_sve_bridge.h>) | |||
| #include <arm_neon_sve_bridge.h> | |||
| #else | |||
| #define svdup_neonq_f32(fixed_reg) \ | |||
| ({ \ | |||
| svfloat32_t scalable_reg; \ | |||
| asm("mov %0.q, %q1" : "=w"(scalable_reg) : "w"(fixed_reg) :); \ | |||
| scalable_reg; \ | |||
| }) | |||
| #define svdup_neonq_f64(fixed_reg) \ | |||
| ({ \ | |||
| svfloat64_t scalable_reg; \ | |||
| asm("mov %0.q, %q1" : "=w"(scalable_reg) : "w"(fixed_reg) :); \ | |||
| scalable_reg; \ | |||
| }) | |||
| #endif | |||
| #define RESET_A_POINTER() a_offset = A; | |||
| #define CREATE_A_POINTER(m, scale) FLOAT* a_offset##m = a_offset + scale * lda; | |||
| #define UPDATE_A_POINTER(scale) a_offset = a_offset + scale * lda; | |||
| #define A_ELEMENT_K(m, offset_k) *(a_offset##m + (k + offset_k)) | |||
| #define A_ELEMENT(m) A_ELEMENT_K(m, 0) | |||
| #define RESET_B_POINTER() b_offset = B; | |||
| #define CREATE_B_POINTER(n, scale) FLOAT* b_offset##n = b_offset + scale; | |||
| #define UPDATE_B_POINTER(scale) b_offset = b_offset + scale; | |||
| #define B_ELEMENT_K(n, offset_k) *(b_offset##n + (k + offset_k) * ldb) | |||
| #define B_ELEMENT(n) B_ELEMENT_K(n, 0) | |||
| #define CREATE_C_POINTER(m, scale) FLOAT* c_offset##m = c_offset + scale; | |||
| #define INCR_C_POINTER(m, incr) // c_offset ## m += incr * ldc; | |||
| #define UPDATE_C_POINTER(scale) c_offset += scale; | |||
| #define C_ELEMENT(m, n) \ | |||
| *(c_offset##m + ((j + n) * ldc)) // C[(i+(m))+(j+(n))*ldc] | |||
| // #undef C_ELEMENT | |||
| // #define C_ELEMENT(m, n) C[(i+(m))+(j+(n))*ldc] | |||
| #define PACK_ELEMENT_K(m, offset_k) packed_a[(k + offset_k) * v_size2 + m] | |||
| #define PACK_ELEMENT(m) PACK_ELEMENT_K(m, 0) | |||
| // ASIMD | |||
| #define DECLARE_RESULT_VECTOR4(m, n) \ | |||
| float32x4_t result##m##n = vdupq_n_f32(0.0); | |||
| #define DECLARE_RESULT(m, n) float32_t result##m##n = 0.0; | |||
| #define BROADCAST_LOAD_A4(m, offset_k) \ | |||
| float32x4_t a##m##_k##offset_k = vld1q_dup_f32(&A_ELEMENT_K(m, offset_k)); | |||
| #define LOAD_A1(m, offset_k) \ | |||
| float32_t a##m##_k##offset_k = A_ELEMENT_K(m, offset_k); | |||
| #define VECTOR_LOAD_B4(n, offset_k) \ | |||
| float32x4_t b##n##_k##offset_k = vld1q_f32(&B_ELEMENT_K(n, offset_k)); | |||
| #define GATHER_LOAD_B4(n, offset_k) \ | |||
| float32x4_t b##n##_k##offset_k = vdupq_n_f32(B_ELEMENT_K(n, offset_k)); \ | |||
| b##n##_k##offset_k = \ | |||
| vsetq_lane_f32(B_ELEMENT_K(n + 1, offset_k), b##n##_k##offset_k, 1); \ | |||
| b##n##_k##offset_k = \ | |||
| vsetq_lane_f32(B_ELEMENT_K(n + 2, offset_k), b##n##_k##offset_k, 2); \ | |||
| b##n##_k##offset_k = \ | |||
| vsetq_lane_f32(B_ELEMENT_K(n + 3, offset_k), b##n##_k##offset_k, 3); | |||
| #define VECTOR_UNPACK_B4(n, offset_k) \ | |||
| float32x4_t b##n##_k##offset_k = vld1q_f32(&PACK_ELEMENT_K(n, offset_k)); | |||
| #define VECTOR_PACK_B4(n, offset_k) \ | |||
| vst1q_f32(&PACK_ELEMENT_K(n, offset_k), b##n##_k##offset_k); | |||
| #define PACK_B0(n, offset_k) \ | |||
| PACK_ELEMENT_K(n, offset_k) = vget_lane_f32(b##n##_k##offset_k, 0); | |||
| #define UPDATE_RESULT_VECTOR4(m, n, offset_k) \ | |||
| result##m##n = \ | |||
| vfmaq_f32(result##m##n, a##m##_k##offset_k, b##n##_k##offset_k); | |||
| #define UPDATE_RESULT(m, n, offset_k) \ | |||
| result##m##n = result##m##n + a##m##_k##offset_k * b##n##_k##offset_k; | |||
| #ifdef B0 | |||
| #define VECTOR_STORE4(m, n) \ | |||
| vst1q_f32(&C_ELEMENT(m, n), vmulq_f32(result##m##n, vdupq_n_f32(alpha))); | |||
| #define STORE(m, n) C_ELEMENT(m, n) = alpha * result##m##n; | |||
| #else | |||
| #define VECTOR_STORE4(m, n) \ | |||
| result##m##n = vmulq_f32(result##m##n, vdupq_n_f32(alpha)); \ | |||
| result##m##n = \ | |||
| vfmaq_f32(result##m##n, vld1q_f32(&C_ELEMENT(m, n)), vdupq_n_f32(beta)); \ | |||
| vst1q_f32(&C_ELEMENT(m, n), result##m##n); | |||
| #define STORE(m, n) \ | |||
| C_ELEMENT(m, n) = C_ELEMENT(m, n) * beta + alpha * result##m##n; | |||
| #endif | |||
| // SVE | |||
| #define DECLARE_RESULT_VECTOR(m, n) svfloat32_t result##m##n = svdup_f32(0.0); | |||
| #define BROADCAST_LOAD_A(m, offset_k) \ | |||
| svfloat32_t a##s##m##_k##offset_k = svdup_f32(A_ELEMENT_K(m, offset_k)); | |||
| #define BROADCAST_LOAD_B(n, offset_k) \ | |||
| svfloat32_t b##s##n##_k##offset_k = svdup_f32(B_ELEMENT_K(n, offset_k)); | |||
| #define VECTOR_LOAD_A(pg, m, offset_k) \ | |||
| svfloat32_t a##s##m##_k##offset_k = svld1(pg, &A_ELEMENT_K(m, offset_k)); | |||
| #define QUADWORD_LOAD_B(n, offset_k) \ | |||
| svfloat32_t b##s##n##_k##offset_k = \ | |||
| svld1rq(pg_true, &B_ELEMENT_K(n, offset_k)); | |||
| #define GATHER_LOAD_A(pg, m, offset_k) \ | |||
| svfloat32_t a##s##m##_k##offset_k = \ | |||
| svld1_gather_index(pg, &A_ELEMENT_K(m, offset_k), lda_vec); | |||
| #define PACK_A(m, offset_k) \ | |||
| svst1(pg_first, &PACK_ELEMENT_K(m, offset_k), a##s##m##_k##offset_k); | |||
| #define VECTOR_PACK_A(m, offset_k) \ | |||
| svst1(pg_true, &PACK_ELEMENT_K(m* v_size, offset_k), a##s##m##_k##offset_k); | |||
| #define QUADWORD_PACK_A(m, offset_k) \ | |||
| svst1(pg_quad, &PACK_ELEMENT_K(m, offset_k), a##s##m##_k##offset_k); | |||
| #define UNPACK_VECTOR_A(m, offset_k) \ | |||
| svfloat32_t a##s##m##_k##offset_k = \ | |||
| svld1(pg_true, &PACK_ELEMENT_K(m * v_size, offset_k)); | |||
| #define UNPACK_BROADCAST_A(m, offset_k) \ | |||
| svfloat32_t a##s##m##_k##offset_k = svdup_f32(PACK_ELEMENT_K(m, offset_k)); | |||
| #define UNPACK_QUADWORD_A(m, offset_k) \ | |||
| svfloat32_t a##s##m##_k##offset_k = \ | |||
| svld1rq(pg_true, &PACK_ELEMENT_K(m, offset_k)); | |||
| #define UPDATE_RESULT_VECTOR(pg, m, n, offset_k) \ | |||
| result##m##n = \ | |||
| svmla_m(pg, result##m##n, a##s##m##_k##offset_k, b##s##n##_k##offset_k); | |||
| #define UPDATE_RESULT_VECTOR_QUADWORD(m, n, outer, lane, offset_k) \ | |||
| result##m##n = svmla_lane( \ | |||
| result##m##n, a##s##m##_k##offset_k, b##s##outer##_k##offset_k, lane); | |||
| #ifdef B0 | |||
| #define VECTOR_STORE(pg, m, n) \ | |||
| result##m##n = svmul_m(pg, result##m##n, alpha_vec); \ | |||
| svst1(pg, &C_ELEMENT(m, n), result##m##n); | |||
| #define SCATTER_STORE(pg, m, n) \ | |||
| result##m##n = svmul_m(pg, result##m##n, alpha_vec); \ | |||
| svst1_scatter_index(pg, &C_ELEMENT(m, n), ldc_vec, result##m##n); | |||
| #else | |||
| #define VECTOR_STORE(pg, m, n) \ | |||
| result##m##n = svmul_m(pg, result##m##n, alpha_vec); \ | |||
| result##m##n = \ | |||
| svmla_m(pg, result##m##n, svld1(pg, &C_ELEMENT(m, n)), beta_vec); \ | |||
| svst1(pg, &C_ELEMENT(m, n), result##m##n); | |||
| #define SCATTER_STORE(pg, m, n) \ | |||
| result##m##n = svmul_m(pg, result##m##n, alpha_vec); \ | |||
| result##m##n = svmla_m(pg, \ | |||
| result##m##n, \ | |||
| svld1_gather_index(pg, &C_ELEMENT(m, n), ldc_vec), \ | |||
| beta_vec); \ | |||
| svst1_scatter_index(pg, &C_ELEMENT(m, n), ldc_vec, result##m##n); | |||
| #endif | |||
| #ifndef LIKELY | |||
| #ifdef __GNUC__ | |||
| #define LIKELY(x) __builtin_expect(!!(x), 1) | |||
| #else | |||
| #define LIKELY(x) (x) | |||
| #endif | |||
| #endif | |||
| #ifdef B0 | |||
| int | |||
| CNAME(BLASLONG M, | |||
| BLASLONG N, | |||
| BLASLONG K, | |||
| IFLOAT* A, | |||
| BLASLONG lda, | |||
| FLOAT alpha, | |||
| IFLOAT* B, | |||
| BLASLONG ldb, | |||
| FLOAT* C, | |||
| BLASLONG ldc) | |||
| #else | |||
| int | |||
| CNAME(BLASLONG M, | |||
| BLASLONG N, | |||
| BLASLONG K, | |||
| IFLOAT* A, | |||
| BLASLONG lda, | |||
| FLOAT alpha, | |||
| IFLOAT* B, | |||
| BLASLONG ldb, | |||
| FLOAT beta, | |||
| FLOAT* C, | |||
| BLASLONG ldc) | |||
| #endif | |||
| { | |||
| const uint64_t v_size = svcntw(); | |||
| const uint64_t v_size2 = v_size * 2; | |||
| const svbool_t pg_true = svptrue_b32(); | |||
| const svbool_t pg_quad = svwhilelt_b32(0, 4); | |||
| const svbool_t pg_first = svwhilelt_b32(0, 1); | |||
| const svfloat32_t alpha_vec = svdup_f32(alpha); | |||
| #ifndef B0 | |||
| const svfloat32_t beta_vec = svdup_f32(beta); | |||
| #endif | |||
| const svuint32_t lda_vec = svindex_u32(0LL, lda); | |||
| const BLASLONG v_m2 = M & -v_size2; | |||
| const BLASLONG v_m1 = M & -v_size; | |||
| const BLASLONG n8 = N & -8; | |||
| const BLASLONG n4 = N & -4; | |||
| const int pack_a = M >= v_size2 && N >= 8 && K >= 8 ? 1 : 0; | |||
| FLOAT* packed_a = | |||
| (pack_a) ? packed_a = (FLOAT*)malloc(K * v_size2 * sizeof(FLOAT)) : NULL; | |||
| FLOAT* a_offset = A; | |||
| FLOAT* b_offset = B; | |||
| FLOAT* c_offset = C; | |||
| BLASLONG i = 0; | |||
| for (; i < v_m2; i += v_size2) { | |||
| CREATE_C_POINTER(0, 0); | |||
| CREATE_C_POINTER(1, v_size); | |||
| CREATE_A_POINTER(0, 0); | |||
| CREATE_A_POINTER(1, v_size); | |||
| BLASLONG j = 0; | |||
| for (; j < n8; j += 8) { | |||
| CREATE_B_POINTER(0, 0); | |||
| CREATE_B_POINTER(1, 1); | |||
| CREATE_B_POINTER(2, 2); | |||
| CREATE_B_POINTER(3, 3); | |||
| CREATE_B_POINTER(4, 4); | |||
| CREATE_B_POINTER(5, 5); | |||
| CREATE_B_POINTER(6, 6); | |||
| CREATE_B_POINTER(7, 7); | |||
| UPDATE_B_POINTER(8); | |||
| BLASLONG k = 0; | |||
| DECLARE_RESULT_VECTOR(0, 0); | |||
| DECLARE_RESULT_VECTOR(0, 1); | |||
| DECLARE_RESULT_VECTOR(0, 2); | |||
| DECLARE_RESULT_VECTOR(0, 3); | |||
| DECLARE_RESULT_VECTOR(0, 4); | |||
| DECLARE_RESULT_VECTOR(0, 5); | |||
| DECLARE_RESULT_VECTOR(0, 6); | |||
| DECLARE_RESULT_VECTOR(0, 7); | |||
| DECLARE_RESULT_VECTOR(1, 0); | |||
| DECLARE_RESULT_VECTOR(1, 1); | |||
| DECLARE_RESULT_VECTOR(1, 2); | |||
| DECLARE_RESULT_VECTOR(1, 3); | |||
| DECLARE_RESULT_VECTOR(1, 4); | |||
| DECLARE_RESULT_VECTOR(1, 5); | |||
| DECLARE_RESULT_VECTOR(1, 6); | |||
| DECLARE_RESULT_VECTOR(1, 7); | |||
| if (LIKELY(packed_a != NULL)) { | |||
| if (j == 0) { | |||
| for (; k < K; k++) { | |||
| QUADWORD_LOAD_B(0, 0); | |||
| GATHER_LOAD_A(pg_true, 0, 0); | |||
| VECTOR_PACK_A(0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 0); | |||
| QUADWORD_LOAD_B(4, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 4, 4, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 5, 4, 1, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 6, 4, 2, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 7, 4, 3, 0); | |||
| GATHER_LOAD_A(pg_true, 1, 0); | |||
| VECTOR_PACK_A(1, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 0, 2, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 0, 3, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 4, 4, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 5, 4, 1, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 6, 4, 2, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 7, 4, 3, 0); | |||
| } | |||
| } else { | |||
| for (; k < K; k++) { | |||
| QUADWORD_LOAD_B(0, 0); | |||
| UNPACK_VECTOR_A(0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 0); | |||
| QUADWORD_LOAD_B(4, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 4, 4, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 5, 4, 1, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 6, 4, 2, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 7, 4, 3, 0); | |||
| UNPACK_VECTOR_A(1, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 0, 2, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 0, 3, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 4, 4, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 5, 4, 1, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 6, 4, 2, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 7, 4, 3, 0); | |||
| } | |||
| } | |||
| } else { | |||
| for (; k < K; k++) { | |||
| QUADWORD_LOAD_B(0, 0); | |||
| GATHER_LOAD_A(pg_true, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 0); | |||
| QUADWORD_LOAD_B(4, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 4, 4, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 5, 4, 1, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 6, 4, 2, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 7, 4, 3, 0); | |||
| GATHER_LOAD_A(pg_true, 1, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 0, 2, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 0, 3, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 4, 4, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 5, 4, 1, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 6, 4, 2, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 7, 4, 3, 0); | |||
| } | |||
| } | |||
| VECTOR_STORE(pg_true, 0, 0); | |||
| VECTOR_STORE(pg_true, 0, 1); | |||
| VECTOR_STORE(pg_true, 0, 2); | |||
| VECTOR_STORE(pg_true, 0, 3); | |||
| VECTOR_STORE(pg_true, 0, 4); | |||
| VECTOR_STORE(pg_true, 0, 5); | |||
| VECTOR_STORE(pg_true, 0, 6); | |||
| VECTOR_STORE(pg_true, 0, 7); | |||
| VECTOR_STORE(pg_true, 1, 0); | |||
| VECTOR_STORE(pg_true, 1, 1); | |||
| VECTOR_STORE(pg_true, 1, 2); | |||
| VECTOR_STORE(pg_true, 1, 3); | |||
| VECTOR_STORE(pg_true, 1, 4); | |||
| VECTOR_STORE(pg_true, 1, 5); | |||
| VECTOR_STORE(pg_true, 1, 6); | |||
| VECTOR_STORE(pg_true, 1, 7); | |||
| INCR_C_POINTER(0, 8); | |||
| INCR_C_POINTER(1, 8); | |||
| } | |||
| for (; j < n4; j += 4) { | |||
| CREATE_B_POINTER(0, 0); | |||
| CREATE_B_POINTER(1, 1); | |||
| CREATE_B_POINTER(2, 2); | |||
| CREATE_B_POINTER(3, 3); | |||
| UPDATE_B_POINTER(4); | |||
| BLASLONG k = 0; | |||
| DECLARE_RESULT_VECTOR(0, 0); | |||
| DECLARE_RESULT_VECTOR(0, 1); | |||
| DECLARE_RESULT_VECTOR(0, 2); | |||
| DECLARE_RESULT_VECTOR(0, 3); | |||
| DECLARE_RESULT_VECTOR(1, 0); | |||
| DECLARE_RESULT_VECTOR(1, 1); | |||
| DECLARE_RESULT_VECTOR(1, 2); | |||
| DECLARE_RESULT_VECTOR(1, 3); | |||
| if (LIKELY(packed_a != NULL)) { | |||
| for (; k < K; k++) { | |||
| QUADWORD_LOAD_B(0, 0); | |||
| UNPACK_VECTOR_A(0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 0); | |||
| UNPACK_VECTOR_A(1, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 0, 2, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 0, 3, 0); | |||
| } | |||
| } else { | |||
| for (; k < K; k++) { | |||
| QUADWORD_LOAD_B(0, 0); | |||
| GATHER_LOAD_A(pg_true, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 0); | |||
| GATHER_LOAD_A(pg_true, 1, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 0, 0, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 1, 0, 1, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 2, 0, 2, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(1, 3, 0, 3, 0); | |||
| } | |||
| } | |||
| VECTOR_STORE(pg_true, 0, 0); | |||
| VECTOR_STORE(pg_true, 0, 1); | |||
| VECTOR_STORE(pg_true, 0, 2); | |||
| VECTOR_STORE(pg_true, 0, 3); | |||
| VECTOR_STORE(pg_true, 1, 0); | |||
| VECTOR_STORE(pg_true, 1, 1); | |||
| VECTOR_STORE(pg_true, 1, 2); | |||
| VECTOR_STORE(pg_true, 1, 3); | |||
| INCR_C_POINTER(0, 4); | |||
| INCR_C_POINTER(1, 4); | |||
| } | |||
| for (; j < N; j++) { | |||
| CREATE_B_POINTER(0, 0); | |||
| UPDATE_B_POINTER(1); | |||
| BLASLONG k = 0; | |||
| DECLARE_RESULT_VECTOR(0, 0); | |||
| DECLARE_RESULT_VECTOR(1, 0); | |||
| if (LIKELY(packed_a != NULL)) { | |||
| for (; k < K; k++) { | |||
| BROADCAST_LOAD_B(0, 0); | |||
| UNPACK_VECTOR_A(0, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); | |||
| UNPACK_VECTOR_A(1, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 0, 0); | |||
| } | |||
| } else { | |||
| for (; k < K; k++) { | |||
| BROADCAST_LOAD_B(0, 0); | |||
| GATHER_LOAD_A(pg_true, 0, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); | |||
| GATHER_LOAD_A(pg_true, 1, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 1, 0, 0); | |||
| } | |||
| } | |||
| VECTOR_STORE(pg_true, 0, 0); | |||
| VECTOR_STORE(pg_true, 1, 0); | |||
| INCR_C_POINTER(0, 1); | |||
| INCR_C_POINTER(1, 1); | |||
| } | |||
| UPDATE_A_POINTER(v_size2); | |||
| RESET_B_POINTER(); | |||
| UPDATE_C_POINTER(v_size2); | |||
| } | |||
| for (; i < v_m1; i += v_size) { | |||
| CREATE_C_POINTER(0, 0); | |||
| CREATE_A_POINTER(0, 0); | |||
| BLASLONG j = 0; | |||
| for (; j < n8; j += 8) { | |||
| CREATE_B_POINTER(0, 0); | |||
| CREATE_B_POINTER(1, 1); | |||
| CREATE_B_POINTER(2, 2); | |||
| CREATE_B_POINTER(3, 3); | |||
| CREATE_B_POINTER(4, 4); | |||
| CREATE_B_POINTER(5, 5); | |||
| CREATE_B_POINTER(6, 6); | |||
| CREATE_B_POINTER(7, 7); | |||
| UPDATE_B_POINTER(8); | |||
| BLASLONG k = 0; | |||
| DECLARE_RESULT_VECTOR(0, 0); | |||
| DECLARE_RESULT_VECTOR(0, 1); | |||
| DECLARE_RESULT_VECTOR(0, 2); | |||
| DECLARE_RESULT_VECTOR(0, 3); | |||
| DECLARE_RESULT_VECTOR(0, 4); | |||
| DECLARE_RESULT_VECTOR(0, 5); | |||
| DECLARE_RESULT_VECTOR(0, 6); | |||
| DECLARE_RESULT_VECTOR(0, 7); | |||
| for (; k < K; k++) { | |||
| QUADWORD_LOAD_B(0, 0); | |||
| GATHER_LOAD_A(pg_true, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 0); | |||
| QUADWORD_LOAD_B(4, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 4, 4, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 5, 4, 1, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 6, 4, 2, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 7, 4, 3, 0); | |||
| } | |||
| VECTOR_STORE(pg_true, 0, 0); | |||
| VECTOR_STORE(pg_true, 0, 1); | |||
| VECTOR_STORE(pg_true, 0, 2); | |||
| VECTOR_STORE(pg_true, 0, 3); | |||
| VECTOR_STORE(pg_true, 0, 4); | |||
| VECTOR_STORE(pg_true, 0, 5); | |||
| VECTOR_STORE(pg_true, 0, 6); | |||
| VECTOR_STORE(pg_true, 0, 7); | |||
| INCR_C_POINTER(0, 8); | |||
| } | |||
| for (; j < n4; j += 4) { | |||
| CREATE_B_POINTER(0, 0); | |||
| CREATE_B_POINTER(1, 1); | |||
| CREATE_B_POINTER(2, 2); | |||
| CREATE_B_POINTER(3, 3); | |||
| UPDATE_B_POINTER(4); | |||
| BLASLONG k = 0; | |||
| DECLARE_RESULT_VECTOR(0, 0); | |||
| DECLARE_RESULT_VECTOR(0, 1); | |||
| DECLARE_RESULT_VECTOR(0, 2); | |||
| DECLARE_RESULT_VECTOR(0, 3); | |||
| for (; k < K; k++) { | |||
| QUADWORD_LOAD_B(0, 0); | |||
| GATHER_LOAD_A(pg_true, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 0); | |||
| } | |||
| VECTOR_STORE(pg_true, 0, 0); | |||
| VECTOR_STORE(pg_true, 0, 1); | |||
| VECTOR_STORE(pg_true, 0, 2); | |||
| VECTOR_STORE(pg_true, 0, 3); | |||
| INCR_C_POINTER(0, 4); | |||
| } | |||
| for (; j < N; j++) { | |||
| CREATE_B_POINTER(0, 0); | |||
| UPDATE_B_POINTER(1); | |||
| BLASLONG k = 0; | |||
| DECLARE_RESULT_VECTOR(0, 0); | |||
| for (; k < K; k++) { | |||
| BROADCAST_LOAD_B(0, 0); | |||
| GATHER_LOAD_A(pg_true, 0, 0); | |||
| UPDATE_RESULT_VECTOR(pg_true, 0, 0, 0); | |||
| } | |||
| VECTOR_STORE(pg_true, 0, 0); | |||
| INCR_C_POINTER(0, 1); | |||
| } | |||
| UPDATE_A_POINTER(v_size); | |||
| RESET_B_POINTER(); | |||
| UPDATE_C_POINTER(v_size); | |||
| } | |||
| for (; i < M; i += v_size) { | |||
| const svbool_t pg_tail = svwhilelt_b32((uint32_t)i, (uint32_t)(M)); | |||
| CREATE_C_POINTER(0, 0); | |||
| CREATE_A_POINTER(0, 0); | |||
| BLASLONG j = 0; | |||
| for (; j < n8; j += 8) { | |||
| CREATE_B_POINTER(0, 0); | |||
| CREATE_B_POINTER(1, 1); | |||
| CREATE_B_POINTER(2, 2); | |||
| CREATE_B_POINTER(3, 3); | |||
| CREATE_B_POINTER(4, 4); | |||
| CREATE_B_POINTER(5, 5); | |||
| CREATE_B_POINTER(6, 6); | |||
| CREATE_B_POINTER(7, 7); | |||
| UPDATE_B_POINTER(8); | |||
| BLASLONG k = 0; | |||
| DECLARE_RESULT_VECTOR(0, 0); | |||
| DECLARE_RESULT_VECTOR(0, 1); | |||
| DECLARE_RESULT_VECTOR(0, 2); | |||
| DECLARE_RESULT_VECTOR(0, 3); | |||
| DECLARE_RESULT_VECTOR(0, 4); | |||
| DECLARE_RESULT_VECTOR(0, 5); | |||
| DECLARE_RESULT_VECTOR(0, 6); | |||
| DECLARE_RESULT_VECTOR(0, 7); | |||
| for (; k < K; k++) { | |||
| QUADWORD_LOAD_B(0, 0); | |||
| GATHER_LOAD_A(pg_tail, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 0); | |||
| QUADWORD_LOAD_B(4, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 4, 4, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 5, 4, 1, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 6, 4, 2, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 7, 4, 3, 0); | |||
| } | |||
| VECTOR_STORE(pg_tail, 0, 0); | |||
| VECTOR_STORE(pg_tail, 0, 1); | |||
| VECTOR_STORE(pg_tail, 0, 2); | |||
| VECTOR_STORE(pg_tail, 0, 3); | |||
| VECTOR_STORE(pg_tail, 0, 4); | |||
| VECTOR_STORE(pg_tail, 0, 5); | |||
| VECTOR_STORE(pg_tail, 0, 6); | |||
| VECTOR_STORE(pg_tail, 0, 7); | |||
| INCR_C_POINTER(0, 8); | |||
| } | |||
| for (; j < n4; j += 4) { | |||
| CREATE_B_POINTER(0, 0); | |||
| CREATE_B_POINTER(1, 1); | |||
| CREATE_B_POINTER(2, 2); | |||
| CREATE_B_POINTER(3, 3); | |||
| UPDATE_B_POINTER(4); | |||
| BLASLONG k = 0; | |||
| DECLARE_RESULT_VECTOR(0, 0); | |||
| DECLARE_RESULT_VECTOR(0, 1); | |||
| DECLARE_RESULT_VECTOR(0, 2); | |||
| DECLARE_RESULT_VECTOR(0, 3); | |||
| for (; k < K; k++) { | |||
| QUADWORD_LOAD_B(0, 0); | |||
| GATHER_LOAD_A(pg_tail, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 0, 0, 0, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 1, 0, 1, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 2, 0, 2, 0); | |||
| UPDATE_RESULT_VECTOR_QUADWORD(0, 3, 0, 3, 0); | |||
| } | |||
| VECTOR_STORE(pg_tail, 0, 0); | |||
| VECTOR_STORE(pg_tail, 0, 1); | |||
| VECTOR_STORE(pg_tail, 0, 2); | |||
| VECTOR_STORE(pg_tail, 0, 3); | |||
| INCR_C_POINTER(0, 4); | |||
| } | |||
| for (; j < N; j++) { | |||
| CREATE_B_POINTER(0, 0); | |||
| UPDATE_B_POINTER(1); | |||
| BLASLONG k = 0; | |||
| DECLARE_RESULT_VECTOR(0, 0); | |||
| for (; k < K; k++) { | |||
| BROADCAST_LOAD_B(0, 0); | |||
| GATHER_LOAD_A(pg_tail, 0, 0); | |||
| UPDATE_RESULT_VECTOR(pg_tail, 0, 0, 0); | |||
| } | |||
| VECTOR_STORE(pg_tail, 0, 0); | |||
| INCR_C_POINTER(0, 1); | |||
| } | |||
| UPDATE_A_POINTER(0); | |||
| RESET_B_POINTER(); | |||
| UPDATE_C_POINTER(0); | |||
| } | |||
| if (pack_a) | |||
| free(packed_a); | |||
| return 0; | |||
| } | |||
| @@ -53,9 +53,86 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| PROLOGUE | |||
| li.d TEMP, SIZE | |||
| ld.d XX, $sp, 0 // Load dummy2 | |||
| slli.d XX, XX, BASE_SHIFT | |||
| MTC a1, $r0 | |||
| slli.d INCX, INCX, BASE_SHIFT | |||
| bge $r0, N, .L999 | |||
| CMPEQ $fcc0, ALPHA, a1 | |||
| bceqz $fcc0, .L50 | |||
| beq XX, TEMP, .L50 // if dummp2 == 1, do not directly copy 0 | |||
| srai.d I, N, 3 | |||
| bne INCX, TEMP, .L20 | |||
| bge $r0, I, .L15 | |||
| .align 3 | |||
| .L12: | |||
| ST a1, X, 0 * SIZE | |||
| ST a1, X, 1 * SIZE | |||
| ST a1, X, 2 * SIZE | |||
| ST a1, X, 3 * SIZE | |||
| ST a1, X, 4 * SIZE | |||
| ST a1, X, 5 * SIZE | |||
| ST a1, X, 6 * SIZE | |||
| ST a1, X, 7 * SIZE | |||
| addi.w I, I, -1 | |||
| addi.d X, X, 8 * SIZE | |||
| blt $r0, I, .L12 | |||
| .align 3 | |||
| .L15: | |||
| andi I, N, 7 | |||
| bge $r0, I, .L999 | |||
| .align 3 | |||
| .L16: | |||
| ST a1, X, 0 * SIZE | |||
| addi.d I, I, -1 | |||
| addi.d X, X, SIZE | |||
| blt $r0, I, .L16 | |||
| move $r4, $r17 | |||
| fmov.d $f0, $f22 | |||
| jirl $r0, $r1, 0x0 | |||
| .align 3 | |||
| .L20: | |||
| srai.d I, N, 3 | |||
| bge $r0, I, .L25 | |||
| .align 3 | |||
| .L22: | |||
| ST a1, X, 0 * SIZE | |||
| add.d X, X, INCX | |||
| ST a1, X, 0 * SIZE | |||
| add.d X, X, INCX | |||
| ST a1, X, 0 * SIZE | |||
| add.d X, X, INCX | |||
| ST a1, X, 0 * SIZE | |||
| add.d X, X, INCX | |||
| ST a1, X, 0 * SIZE | |||
| add.d X, X, INCX | |||
| ST a1, X, 0 * SIZE | |||
| add.d X, X, INCX | |||
| ST a1, X, 0 * SIZE | |||
| add.d X, X, INCX | |||
| ST a1, X, 0 * SIZE | |||
| addi.d I, I, -1 | |||
| add.d X, X, INCX | |||
| blt $r0, I, .L22 | |||
| .align 3 | |||
| .L25: | |||
| andi I, N, 7 | |||
| bge $r0, I, .L999 | |||
| .align 3 | |||
| .L26: | |||
| addi.d I, I, -1 | |||
| ST a1, X, 0 * SIZE | |||
| add.d X, X, INCX | |||
| blt $r0, I, .L26 | |||
| move $r4, $r17 | |||
| fmov.d $f0, $f22 | |||
| jirl $r0, $r1, 0x0 | |||
| .align 3 | |||
| .L50: | |||
| srai.d I, N, 3 | |||
| @@ -52,17 +52,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| bge $r0, N, .L999 | |||
| bge $r0, INCX, .L999 | |||
| li.d TEMP, 1 | |||
| ld.d t1, $sp, 0 // Load dummp2 | |||
| movgr2fr.d a1, $r0 | |||
| FFINT a1, a1 | |||
| movgr2fr.d a2, TEMP | |||
| FFINT a2, a2 | |||
| slli.d TEMP, TEMP, BASE_SHIFT | |||
| slli.d INCX, INCX, BASE_SHIFT | |||
| slli.d t1, t1, BASE_SHIFT | |||
| CMPEQ $fcc0, ALPHA, a1 | |||
| bcnez $fcc0, .L20 //ALPHA==0 | |||
| CMPEQ $fcc0, ALPHA, a2 | |||
| bcnez $fcc0, .L999 //ALPHA==1 return | |||
| .L1: | |||
| srai.d I, N, 3 | |||
| beq INCX, TEMP, .L30 //ALPHA!=1 and INCX==1 | |||
| beq INCX, TEMP, .L30 //ALPHA !=0|1 and INCX==1 | |||
| MTG TEMP, ALPHA | |||
| #ifdef DOUBLE | |||
| xvreplgr2vr.d VALPHA, TEMP | |||
| @@ -72,7 +76,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| move XX, X | |||
| .align 3 | |||
| .L10: //ALPHA!=1 and INCX!=1 | |||
| .L10: //ALPHA !=0|1 and INCX!=1 | |||
| bge $r0, I, .L32 | |||
| .align 3 | |||
| .L11: | |||
| @@ -165,6 +169,75 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| blt $r0, I, .L11 | |||
| b .L32 | |||
| .align 3 | |||
| .L20: | |||
| beq t1, TEMP, .L1 // if dummp2 == 1, do not directly copy 0 | |||
| srai.d I, N, 3 | |||
| beq INCX, TEMP, .L24 | |||
| bge $r0, I, .L22 | |||
| .align 3 | |||
| .L21: | |||
| ST a1, X, 0 | |||
| add.d X, X, INCX | |||
| ST a1, X, 0 | |||
| add.d X, X, INCX | |||
| ST a1, X, 0 | |||
| add.d X, X, INCX | |||
| ST a1, X, 0 | |||
| add.d X, X, INCX | |||
| ST a1, X, 0 | |||
| add.d X, X, INCX | |||
| ST a1, X, 0 | |||
| add.d X, X, INCX | |||
| ST a1, X, 0 | |||
| add.d X, X, INCX | |||
| ST a1, X, 0 | |||
| add.d X, X, INCX | |||
| addi.d I, I, -1 | |||
| blt $r0, I, .L21 | |||
| .align 3 | |||
| .L22: | |||
| andi I, N, 7 | |||
| bge $r0, I, .L999 | |||
| .align 3 | |||
| .L23: | |||
| ST a1, X, 0 * SIZE | |||
| addi.d I, I, -1 | |||
| add.d X, X, INCX | |||
| blt $r0, I, .L23 | |||
| jirl $r0, $r1, 0 | |||
| .align 3 | |||
| .L24: | |||
| bge $r0, I, .L26 /*N<8 INCX==1*/ | |||
| .align 3 | |||
| .L25: | |||
| xvxor.v VX0, VX0, VX0 | |||
| xvst VX0, X, 0 * SIZE | |||
| #ifdef DOUBLE | |||
| xvst VX0, X, 4 * SIZE | |||
| #endif | |||
| addi.d I, I, -1 | |||
| addi.d X, X, 8 * SIZE | |||
| blt $r0, I, .L25 | |||
| .align 3 | |||
| .L26: | |||
| andi I, N, 7 | |||
| bge $r0, I, .L999 | |||
| .align 3 | |||
| .L27: | |||
| ST a1, X, 0 * SIZE | |||
| addi.d I, I, -1 | |||
| addi.d X, X, SIZE | |||
| blt $r0, I, .L27 | |||
| jirl $r0, $r1, 0 | |||
| .align 3 | |||
| .L30: | |||
| bge $r0, I, .L32/*N<8 INCX==1*/ | |||
| MTG TEMP, ALPHA | |||
| @@ -51,6 +51,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| bge $r0, N, .L999 | |||
| bge $r0, INCX, .L999 | |||
| ld.d t1, $sp, 0 // Load dummy2 | |||
| li.d TEMP, 1 | |||
| movgr2fr.d a1, $r0 | |||
| FFINT a1, a1 | |||
| @@ -58,10 +59,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| FFINT a2, a2 | |||
| slli.d TEMP, TEMP, BASE_SHIFT | |||
| slli.d INCX, INCX, BASE_SHIFT | |||
| slli.d t1, t1, BASE_SHIFT | |||
| CMPEQ $fcc0, ALPHA, a1 | |||
| bcnez $fcc0, .L20 //ALPHA==0 | |||
| CMPEQ $fcc0, ALPHA, a2 | |||
| bcnez $fcc0, .L999 //ALPHA==1 return | |||
| .L1: | |||
| srai.d I, N, 3 | |||
| beq INCX, TEMP, .L30 //ALPHA!=1 and INCX==1 | |||
| beq INCX, TEMP, .L30 //ALPHA !=0|1 and INCX==1 | |||
| MTG TEMP, ALPHA | |||
| #ifdef DOUBLE | |||
| vreplgr2vr.d VALPHA, TEMP | |||
| @@ -71,7 +76,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| move XX, X | |||
| .align 3 | |||
| .L10: //ALPHA!=1 and INCX!=1 | |||
| .L10: //ALPHA !=0|1 and INCX!=1 | |||
| bge $r0, I, .L32 | |||
| .align 3 | |||
| @@ -169,6 +174,79 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| b .L32 | |||
| .align 3 | |||
| .L20: | |||
| beq t1, TEMP, .L1 // if dummp2 == 1, do not directly copy 0 | |||
| srai.d I, N, 3 | |||
| beq INCX, TEMP, .L24 | |||
| bge $r0, I, .L22 | |||
| .align 3 | |||
| .L21: | |||
| ST a1, X, 0 | |||
| add.d X, X, INCX | |||
| ST a1, X, 0 | |||
| add.d X, X, INCX | |||
| ST a1, X, 0 | |||
| add.d X, X, INCX | |||
| ST a1, X, 0 | |||
| add.d X, X, INCX | |||
| ST a1, X, 0 | |||
| add.d X, X, INCX | |||
| ST a1, X, 0 | |||
| add.d X, X, INCX | |||
| ST a1, X, 0 | |||
| add.d X, X, INCX | |||
| ST a1, X, 0 | |||
| add.d X, X, INCX | |||
| addi.d I, I, -1 | |||
| blt $r0, I, .L21 | |||
| .align 3 | |||
| .L22: | |||
| andi I, N, 7 | |||
| bge $r0, I, .L999 | |||
| .align 3 | |||
| .L23: | |||
| ST a1, X, 0 * SIZE | |||
| addi.d I, I, -1 | |||
| add.d X, X, INCX | |||
| blt $r0, I, .L23 | |||
| jirl $r0, $r1, 0 | |||
| .align 3 | |||
| .L24: | |||
| bge $r0, I, .L26 /*N<8 INCX==1*/ | |||
| .align 3 | |||
| .L25: | |||
| vxor.v VX0, VX0, VX0 | |||
| vst VX0, X, 0 * SIZE | |||
| #ifdef DOUBLE | |||
| vst VX0, X, 2 * SIZE | |||
| vst VX0, X, 4 * SIZE | |||
| vst VX0, X, 6 * SIZE | |||
| #else | |||
| vst VX0, X, 4 * SIZE | |||
| #endif | |||
| addi.d I, I, -1 | |||
| addi.d X, X, 8 * SIZE | |||
| blt $r0, I, .L25 | |||
| .align 3 | |||
| .L26: | |||
| andi I, N, 7 | |||
| bge $r0, I, .L999 | |||
| .align 3 | |||
| .L27: | |||
| ST a1, X, 0 * SIZE | |||
| addi.d I, I, -1 | |||
| addi.d X, X, SIZE | |||
| blt $r0, I, .L27 | |||
| jirl $r0, $r1, 0 | |||
| .align 3 | |||
| .L30: | |||
| bge $r0, I, .L32/*N<8 INCX==1*/ | |||
| MTG TEMP, ALPHA | |||
| @@ -42,7 +42,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, | |||
| if (1 == inc_x) | |||
| { | |||
| if (0) //if (0.0 == da ) | |||
| if (0.0 == da && !dummy2) | |||
| { | |||
| v2f64 zero_v = {0.0, 0.0}; | |||
| @@ -240,14 +240,12 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, | |||
| } | |||
| else | |||
| { | |||
| if (da == 0.0) | |||
| if (da == 0.0 && !dummy2) | |||
| { | |||
| for (i = n; i--;) | |||
| { | |||
| if (isfinite(*x)) | |||
| *x = 0.0; | |||
| else | |||
| *x = NAN; | |||
| { | |||
| *x = 0.0; | |||
| x += inc_x; | |||
| } | |||
| } | |||
| @@ -29,27 +29,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) | |||
| { | |||
| BLASLONG i=0,j=0; | |||
| while(j < n) | |||
| { | |||
| if ( da == 0.0 ) | |||
| if (isnan(x[i])||isinf(x[i])) | |||
| x[i]=NAN; | |||
| else | |||
| x[i]=0.0; | |||
| else if (isnan(da)) | |||
| x[i]=NAN; | |||
| else | |||
| x[i] = da * x[i] ; | |||
| i += inc_x ; | |||
| j++; | |||
| } | |||
| return 0; | |||
| BLASLONG i = 0, j = 0; | |||
| // Resolved issue 4728 when the caller is {s/d}scal | |||
| if (da == 0.0 && dummy2 == 1) | |||
| { | |||
| while(j < n) | |||
| { | |||
| x[i] = da * x[i] ; | |||
| i += inc_x ; | |||
| j++; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| while(j < n) | |||
| { | |||
| if ( da == 0.0 ) | |||
| x[i] = 0.0; | |||
| else | |||
| x[i] = da * x[i] ; | |||
| i += inc_x ; | |||
| j++; | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -42,7 +42,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, | |||
| if (1 == inc_x) | |||
| { | |||
| if (0) // if (0.0 == da) | |||
| if (0.0 == da && !dummy2) | |||
| { | |||
| v4f32 zero_v = {0.0, 0.0, 0.0, 0.0}; | |||
| @@ -255,14 +255,11 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, | |||
| } | |||
| else | |||
| { | |||
| if (0.0 == da) | |||
| if (0.0 == da && !dummy2) | |||
| { | |||
| for (i = n; i--;) | |||
| { | |||
| if (isfinite(*x)) | |||
| *x = 0; | |||
| else | |||
| *x = NAN; | |||
| *x = 0; | |||
| x += inc_x; | |||
| } | |||
| } | |||
| @@ -48,6 +48,7 @@ | |||
| #define TEMP $3 | |||
| #define XX $5 | |||
| #define DUMMY2 $6 | |||
| #define ALPHA $f15 | |||
| @@ -73,13 +74,13 @@ | |||
| blez N, .L999 | |||
| dsll INCX, INCX, BASE_SHIFT | |||
| CMPEQ $fcc0, ALPHA, a1 | |||
| NOP | |||
| CMPEQ $fcc0, ALPHA, a1 | |||
| LDARG DUMMY2, 8($sp) | |||
| bc1f $fcc0, .L50 | |||
| NOP | |||
| dsll DUMMY2, DUMMY2, BASE_SHIFT | |||
| bc1t $fcc0, .L50 | |||
| beq DUMMY2, TEMP, .L50 // If dummy2 == 1, do not directly copy 0 | |||
| NOP | |||
| bne INCX, TEMP, .L20 | |||
| @@ -73,6 +73,15 @@ static void dscal_kernel_8_zero (BLASLONG n, FLOAT *x) | |||
| for( i=0; i<n; i+=8 ) | |||
| { | |||
| x[0] = alpha; | |||
| x[1] = alpha; | |||
| x[2] = alpha; | |||
| x[3] = alpha; | |||
| x[4] = alpha; | |||
| x[5] = alpha; | |||
| x[6] = alpha; | |||
| x[7] = alpha; | |||
| #if 0 | |||
| if(isfinite(x[0])) | |||
| x[0] = alpha; | |||
| else | |||
| @@ -106,7 +115,8 @@ static void dscal_kernel_8_zero (BLASLONG n, FLOAT *x) | |||
| else | |||
| x[7] = NAN; | |||
| x+=8; | |||
| } | |||
| #endif | |||
| } | |||
| } | |||
| @@ -130,6 +140,11 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS | |||
| if ( n >= 16 ) | |||
| { | |||
| BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3; | |||
| if (dummy2 == 0) | |||
| for (j = 0; j < align; j++) { | |||
| x [j] = 0.0; | |||
| } | |||
| else | |||
| for (j = 0; j < align; j++) { | |||
| if (isfinite(x[j])) | |||
| x[j] = 0.0; | |||
| @@ -151,7 +166,13 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS | |||
| j=n1; | |||
| } | |||
| #endif | |||
| if (dummy2 == 0) | |||
| while(j < n) | |||
| { | |||
| x[j]=0.0; | |||
| j++; | |||
| } | |||
| else | |||
| while(j < n) | |||
| { | |||
| if (!isfinite(x[j])) | |||
| @@ -202,7 +223,14 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS | |||
| if ( da == 0.0 ) | |||
| { | |||
| if (dummy2 == 0) | |||
| while(j < n) | |||
| { | |||
| x[i]=0.0; | |||
| i += inc_x; | |||
| j++; | |||
| } | |||
| else | |||
| while(j < n) | |||
| { | |||
| if (!isfinite(x[i])) | |||
| @@ -864,15 +864,15 @@ LL(22): | |||
| LFD f22, 10 * SIZE(BO) | |||
| LFD f23, 11 * SIZE(BO) | |||
| FMADD f2, f18, f24, f2 | |||
| FMADD f3, f19, f24, f3 | |||
| FMADD f6, f18, f25, f6 | |||
| FMADD f7, f19, f25, f7 | |||
| FMADD f0, f18, f24, f0 | |||
| FMADD f1, f19, f24, f1 | |||
| FMADD f4, f18, f25, f4 | |||
| FMADD f5, f19, f25, f5 | |||
| FMADD f10, f18, f26, f10 | |||
| FMADD f11, f19, f26, f11 | |||
| FMADD f14, f18, f27, f14 | |||
| FMADD f15, f19, f27, f15 | |||
| FMADD f8, f18, f26, f8 | |||
| FMADD f9, f19, f26, f9 | |||
| FMADD f12, f18, f27, f12 | |||
| FMADD f13, f19, f27, f13 | |||
| LFD f16, 4 * SIZE(AO) | |||
| LFD f17, 5 * SIZE(AO) | |||
| @@ -899,15 +899,15 @@ LL(22): | |||
| LFD f22, 18 * SIZE(BO) | |||
| LFD f23, 19 * SIZE(BO) | |||
| FMADD f2, f18, f24, f2 | |||
| FMADD f3, f19, f24, f3 | |||
| FMADD f6, f18, f25, f6 | |||
| FMADD f7, f19, f25, f7 | |||
| FMADD f0, f18, f24, f0 | |||
| FMADD f1, f19, f24, f1 | |||
| FMADD f4, f18, f25, f4 | |||
| FMADD f5, f19, f25, f5 | |||
| FMADD f10, f18, f26, f10 | |||
| FMADD f11, f19, f26, f11 | |||
| FMADD f14, f18, f27, f14 | |||
| FMADD f15, f19, f27, f15 | |||
| FMADD f8, f18, f26, f8 | |||
| FMADD f9, f19, f26, f9 | |||
| FMADD f12, f18, f27, f12 | |||
| FMADD f13, f19, f27, f13 | |||
| LFD f16, 8 * SIZE(AO) | |||
| LFD f17, 9 * SIZE(AO) | |||
| @@ -923,14 +923,6 @@ LL(22): | |||
| addi BO, BO, 16 * SIZE | |||
| bdnz LL(22) | |||
| fadd f0, f2, f0 | |||
| fadd f1, f3, f1 | |||
| fadd f4, f6, f4 | |||
| fadd f5, f7, f5 | |||
| fadd f8, f10, f8 | |||
| fadd f9, f11, f9 | |||
| fadd f12, f14, f12 | |||
| fadd f13, f15, f13 | |||
| .align 4 | |||
| LL(25): | |||
| @@ -1161,10 +1153,10 @@ LL(32): | |||
| LFD f22, 10 * SIZE(BO) | |||
| LFD f23, 11 * SIZE(BO) | |||
| FMADD f1, f17, f24, f1 | |||
| FMADD f5, f17, f25, f5 | |||
| FMADD f9, f17, f26, f9 | |||
| FMADD f13, f17, f27, f13 | |||
| FMADD f0, f17, f24, f0 | |||
| FMADD f4, f17, f25, f4 | |||
| FMADD f8, f17, f26, f8 | |||
| FMADD f12, f17, f27, f12 | |||
| LFD f24, 12 * SIZE(BO) | |||
| LFD f25, 13 * SIZE(BO) | |||
| @@ -1181,10 +1173,10 @@ LL(32): | |||
| LFD f22, 18 * SIZE(BO) | |||
| LFD f23, 19 * SIZE(BO) | |||
| FMADD f1, f19, f24, f1 | |||
| FMADD f5, f19, f25, f5 | |||
| FMADD f9, f19, f26, f9 | |||
| FMADD f13, f19, f27, f13 | |||
| FMADD f0, f19, f24, f0 | |||
| FMADD f4, f19, f25, f4 | |||
| FMADD f8, f19, f26, f8 | |||
| FMADD f12, f19, f27, f12 | |||
| LFD f16, 4 * SIZE(AO) | |||
| LFD f17, 5 * SIZE(AO) | |||
| @@ -1200,10 +1192,6 @@ LL(32): | |||
| addi BO, BO, 16 * SIZE | |||
| bdnz LL(32) | |||
| fadd f0, f1, f0 | |||
| fadd f4, f5, f4 | |||
| fadd f8, f9, f8 | |||
| fadd f12, f13, f12 | |||
| .align 4 | |||
| LL(35): | |||
| @@ -1691,10 +1679,10 @@ LL(52): | |||
| FMADD f2, f16, f21, f2 | |||
| FMADD f3, f17, f21, f3 | |||
| FMADD f4, f18, f22, f4 | |||
| FMADD f5, f19, f22, f5 | |||
| FMADD f6, f18, f23, f6 | |||
| FMADD f7, f19, f23, f7 | |||
| FMADD f0, f18, f22, f0 | |||
| FMADD f1, f19, f22, f1 | |||
| FMADD f2, f18, f23, f2 | |||
| FMADD f3, f19, f23, f3 | |||
| LFD f16, 4 * SIZE(AO) | |||
| LFD f17, 5 * SIZE(AO) | |||
| @@ -1711,10 +1699,10 @@ LL(52): | |||
| FMADD f2, f16, f25, f2 | |||
| FMADD f3, f17, f25, f3 | |||
| FMADD f4, f18, f26, f4 | |||
| FMADD f5, f19, f26, f5 | |||
| FMADD f6, f18, f27, f6 | |||
| FMADD f7, f19, f27, f7 | |||
| FMADD f0, f18, f26, f0 | |||
| FMADD f1, f19, f26, f1 | |||
| FMADD f2, f18, f27, f2 | |||
| FMADD f3, f19, f27, f3 | |||
| LFD f16, 8 * SIZE(AO) | |||
| LFD f17, 9 * SIZE(AO) | |||
| @@ -1775,21 +1763,11 @@ LL(58): | |||
| LFD f18, 0 * SIZE(CO2) | |||
| LFD f19, 1 * SIZE(CO2) | |||
| FADD f0, f4, f0 | |||
| FADD f1, f5, f1 | |||
| FADD f2, f6, f2 | |||
| FADD f3, f7, f3 | |||
| FMADD f0, f0, f30, f16 | |||
| FMADD f1, f1, f30, f17 | |||
| FMADD f2, f2, f30, f18 | |||
| FMADD f3, f3, f30, f19 | |||
| #else | |||
| FADD f0, f4, f0 | |||
| FADD f1, f5, f1 | |||
| FADD f2, f6, f2 | |||
| FADD f3, f7, f3 | |||
| FMUL f0, f0, f30 | |||
| FMUL f1, f1, f30 | |||
| FMUL f2, f2, f30 | |||
| @@ -1916,8 +1894,8 @@ LL(60): | |||
| LL(62): | |||
| FMADD f0, f16, f20, f0 | |||
| FMADD f1, f16, f21, f1 | |||
| FMADD f2, f17, f22, f2 | |||
| FMADD f3, f17, f23, f3 | |||
| FMADD f0, f17, f22, f0 | |||
| FMADD f1, f17, f23, f1 | |||
| LFD f20, 8 * SIZE(BO) | |||
| LFD f21, 9 * SIZE(BO) | |||
| @@ -1926,8 +1904,8 @@ LL(62): | |||
| FMADD f0, f18, f24, f0 | |||
| FMADD f1, f18, f25, f1 | |||
| FMADD f2, f19, f26, f2 | |||
| FMADD f3, f19, f27, f3 | |||
| FMADD f0, f19, f26, f0 | |||
| FMADD f1, f19, f27, f1 | |||
| LFD f16, 4 * SIZE(AO) | |||
| LFD f17, 5 * SIZE(AO) | |||
| @@ -1986,15 +1964,9 @@ LL(68): | |||
| LFD f16, 0 * SIZE(CO1) | |||
| LFD f18, 0 * SIZE(CO2) | |||
| FADD f0, f2, f0 | |||
| FADD f1, f3, f1 | |||
| FMADD f0, f0, f30, f16 | |||
| FMADD f1, f1, f30, f18 | |||
| #else | |||
| FADD f0, f2, f0 | |||
| FADD f1, f3, f1 | |||
| FMUL f0, f0, f30 | |||
| FMUL f1, f1, f30 | |||
| #endif | |||
| @@ -2007,7 +1979,6 @@ LL(68): | |||
| fmr f4, f0 | |||
| fmr f5, f0 | |||
| #ifdef TRMMKERNEL | |||
| #if ( defined(LEFT) && defined(TRANSA)) || \ | |||
| (!defined(LEFT) && !defined(TRANSA)) | |||
| @@ -2332,8 +2303,8 @@ LL(80): | |||
| LL(82): | |||
| FMADD f0, f16, f20, f0 | |||
| FMADD f1, f17, f20, f1 | |||
| FMADD f2, f18, f21, f2 | |||
| FMADD f3, f19, f21, f3 | |||
| FMADD f0, f18, f21, f0 | |||
| FMADD f1, f19, f21, f1 | |||
| LFD f16, 4 * SIZE(AO) | |||
| LFD f17, 5 * SIZE(AO) | |||
| @@ -2342,8 +2313,8 @@ LL(82): | |||
| FMADD f0, f16, f22, f0 | |||
| FMADD f1, f17, f22, f1 | |||
| FMADD f2, f18, f23, f2 | |||
| FMADD f3, f19, f23, f3 | |||
| FMADD f0, f18, f23, f0 | |||
| FMADD f1, f19, f23, f1 | |||
| LFD f16, 8 * SIZE(AO) | |||
| LFD f17, 9 * SIZE(AO) | |||
| @@ -2401,15 +2372,9 @@ LL(88): | |||
| LFD f16, 0 * SIZE(CO1) | |||
| LFD f17, 1 * SIZE(CO1) | |||
| FADD f0, f2, f0 | |||
| FADD f1, f3, f1 | |||
| FMADD f0, f0, f30, f16 | |||
| FMADD f1, f1, f30, f17 | |||
| #else | |||
| FADD f0, f2, f0 | |||
| FADD f1, f3, f1 | |||
| FMUL f0, f0, f30 | |||
| FMUL f1, f1, f30 | |||
| #endif | |||
| @@ -2418,9 +2383,6 @@ LL(88): | |||
| STFD f1, 1 * SIZE(CO1) | |||
| lfs f0, FZERO | |||
| fmr f1, f0 | |||
| fmr f2, f0 | |||
| fmr f3, f0 | |||
| addi CO1, CO1, 2 * SIZE | |||
| @@ -2512,9 +2474,9 @@ LL(90): | |||
| LL(92): | |||
| FMADD f0, f16, f20, f0 | |||
| FMADD f1, f17, f21, f1 | |||
| FMADD f2, f18, f22, f2 | |||
| FMADD f3, f19, f23, f3 | |||
| FMADD f0, f17, f21, f0 | |||
| FMADD f0, f18, f22, f0 | |||
| FMADD f0, f19, f23, f0 | |||
| LFD f16, 4 * SIZE(AO) | |||
| LFD f17, 5 * SIZE(AO) | |||
| @@ -2527,9 +2489,9 @@ LL(92): | |||
| LFD f23, 7 * SIZE(BO) | |||
| FMADD f0, f16, f20, f0 | |||
| FMADD f1, f17, f21, f1 | |||
| FMADD f2, f18, f22, f2 | |||
| FMADD f3, f19, f23, f3 | |||
| FMADD f0, f17, f21, f0 | |||
| FMADD f0, f18, f22, f0 | |||
| FMADD f0, f19, f23, f0 | |||
| LFD f16, 8 * SIZE(AO) | |||
| LFD f17, 9 * SIZE(AO) | |||
| @@ -2583,16 +2545,8 @@ LL(98): | |||
| #ifndef TRMMKERNEL | |||
| LFD f16, 0 * SIZE(CO1) | |||
| FADD f0, f1, f0 | |||
| FADD f2, f3, f2 | |||
| FADD f0, f2, f0 | |||
| FMADD f0, f0, f30, f16 | |||
| #else | |||
| FADD f0, f1, f0 | |||
| FADD f2, f3, f2 | |||
| FADD f0, f2, f0 | |||
| FMUL f0, f0, f30 | |||
| #endif | |||
| @@ -47,9 +47,11 @@ | |||
| #ifndef __64BIT__ | |||
| #define X r6 | |||
| #define INCX r7 | |||
| #define FLAG r11 | |||
| #else | |||
| #define X r7 | |||
| #define INCX r8 | |||
| #define FLAG r12 | |||
| #endif | |||
| #endif | |||
| @@ -57,9 +59,11 @@ | |||
| #if !defined(__64BIT__) && defined(DOUBLE) | |||
| #define X r8 | |||
| #define INCX r9 | |||
| #define FLAG r13 | |||
| #else | |||
| #define X r7 | |||
| #define INCX r8 | |||
| #define FLAG r12 | |||
| #endif | |||
| #endif | |||
| @@ -84,9 +88,12 @@ | |||
| cmpwi cr0, N, 0 | |||
| blelr- cr0 | |||
| // fcmpu cr0, FZERO, ALPHA | |||
| // bne- cr0, LL(A1I1) | |||
| b LL(A1I1) | |||
| fcmpu cr0, FZERO, ALPHA | |||
| bne- cr0, LL(A1I1) | |||
| ld FLAG, 48+64+8(SP) | |||
| cmpwi cr0, FLAG, 1 | |||
| beq- cr0, LL(A1I1) | |||
| cmpwi cr0, INCX, SIZE | |||
| bne- cr0, LL(A0IN) | |||
| @@ -74,7 +74,24 @@ static void sscal_kernel_16_zero( BLASLONG n, FLOAT *x ) | |||
| for( i=0; i<n; i+=8 ) | |||
| { | |||
| if (isfinite(x[0])) | |||
| x[0] = alpha; | |||
| x[1] = alpha; | |||
| x[2] = alpha; | |||
| x[3] = alpha; | |||
| x[4] = alpha; | |||
| x[5] = alpha; | |||
| x[6] = alpha; | |||
| x[7] = alpha; | |||
| x[8] = alpha; | |||
| x[9] = alpha; | |||
| x[10] = alpha; | |||
| x[11] = alpha; | |||
| x[12] = alpha; | |||
| x[13] = alpha; | |||
| x[14] = alpha; | |||
| x[15] = alpha; | |||
| #if 0 | |||
| if (isfinite(x[0])) | |||
| x[0] = alpha; | |||
| else | |||
| x[0] = NAN; | |||
| @@ -107,7 +124,8 @@ static void sscal_kernel_16_zero( BLASLONG n, FLOAT *x ) | |||
| else | |||
| x[7] = NAN; | |||
| x+=8; | |||
| } | |||
| #endif | |||
| } | |||
| } | |||
| @@ -132,6 +150,11 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS | |||
| if ( n >= 32 ) | |||
| { | |||
| BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 2) & 0x7; | |||
| if (dummy2 == 0) | |||
| for (j = 0; j < align; j++){ | |||
| x[j] = 0.0; | |||
| } | |||
| else | |||
| for (j = 0; j < align; j++) { | |||
| if (isfinite(x[j])) | |||
| x[j] = 0.0; | |||
| @@ -153,9 +176,15 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS | |||
| j=n1; | |||
| } | |||
| #endif | |||
| if (dummy2 == 0) | |||
| while(j < n) | |||
| { | |||
| x[j] = 0.0; | |||
| j++; | |||
| } | |||
| else | |||
| while(j < n) | |||
| { | |||
| if (isfinite(x[j])) | |||
| x[j]=0.0; | |||
| else | |||
| @@ -204,7 +233,14 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS | |||
| if ( da == 0.0 ) | |||
| { | |||
| if (dummy2 == 0) | |||
| while(j < n) | |||
| { | |||
| x[i]=0.0; | |||
| i += inc_x; | |||
| j++; | |||
| } | |||
| else | |||
| while(j < n) | |||
| { | |||
| if (isfinite(x[i])) | |||
| @@ -1159,9 +1159,9 @@ LL(20): | |||
| LL(22): | |||
| FMA1 f0, f16, f20, f0 | |||
| FMA4 f3, f17, f20, f3 | |||
| FMA2 f1, f16, f21, f1 | |||
| FMA3 f2, f17, f21, f2 | |||
| FMA4 f1, f17, f20, f1 | |||
| FMA3 f0, f17, f21, f0 | |||
| LFD f28, 4 * SIZE(AO) | |||
| LFD f29, 5 * SIZE(AO) | |||
| @@ -1169,9 +1169,9 @@ LL(22): | |||
| LFD f31, 7 * SIZE(AO) | |||
| FMA1 f4, f16, f22, f4 | |||
| FMA4 f7, f17, f22, f7 | |||
| FMA2 f5, f16, f23, f5 | |||
| FMA3 f6, f17, f23, f6 | |||
| FMA4 f5, f17, f22, f5 | |||
| FMA3 f4, f17, f23, f4 | |||
| LFD f20, 8 * SIZE(BO) | |||
| LFD f21, 9 * SIZE(BO) | |||
| @@ -1179,14 +1179,14 @@ LL(22): | |||
| LFD f23, 11 * SIZE(BO) | |||
| FMA1 f8, f16, f24, f8 | |||
| FMA4 f11, f17, f24, f11 | |||
| FMA2 f9, f16, f25, f9 | |||
| FMA3 f10, f17, f25, f10 | |||
| FMA4 f9, f17, f24, f9 | |||
| FMA3 f8, f17, f25, f8 | |||
| FMA1 f12, f16, f26, f12 | |||
| FMA4 f15, f17, f26, f15 | |||
| FMA2 f13, f16, f27, f13 | |||
| FMA3 f14, f17, f27, f14 | |||
| FMA4 f13, f17, f26, f13 | |||
| FMA3 f12, f17, f27, f12 | |||
| LFD f24, 12 * SIZE(BO) | |||
| LFD f25, 13 * SIZE(BO) | |||
| @@ -1194,14 +1194,14 @@ LL(22): | |||
| LFD f27, 15 * SIZE(BO) | |||
| FMA1 f0, f18, f20, f0 | |||
| FMA4 f3, f19, f20, f3 | |||
| FMA2 f1, f18, f21, f1 | |||
| FMA3 f2, f19, f21, f2 | |||
| FMA4 f1, f19, f20, f1 | |||
| FMA3 f0, f19, f21, f0 | |||
| FMA1 f4, f18, f22, f4 | |||
| FMA4 f7, f19, f22, f7 | |||
| FMA2 f5, f18, f23, f5 | |||
| FMA3 f6, f19, f23, f6 | |||
| FMA4 f5, f19, f22, f5 | |||
| FMA3 f4, f19, f23, f4 | |||
| LFD f20, 16 * SIZE(BO) | |||
| LFD f21, 17 * SIZE(BO) | |||
| @@ -1209,14 +1209,14 @@ LL(22): | |||
| LFD f23, 19 * SIZE(BO) | |||
| FMA1 f8, f18, f24, f8 | |||
| FMA4 f11, f19, f24, f11 | |||
| FMA2 f9, f18, f25, f9 | |||
| FMA3 f10, f19, f25, f10 | |||
| FMA4 f9, f19, f24, f9 | |||
| FMA3 f8, f19, f25, f8 | |||
| FMA1 f12, f18, f26, f12 | |||
| FMA4 f15, f19, f26, f15 | |||
| FMA2 f13, f18, f27, f13 | |||
| FMA3 f14, f19, f27, f14 | |||
| FMA4 f13, f19, f26, f13 | |||
| FMA3 f12, f19, f27, f12 | |||
| LFD f24, 20 * SIZE(BO) | |||
| LFD f25, 21 * SIZE(BO) | |||
| @@ -1224,9 +1224,9 @@ LL(22): | |||
| LFD f27, 23 * SIZE(BO) | |||
| FMA1 f0, f28, f20, f0 | |||
| FMA4 f3, f29, f20, f3 | |||
| FMA2 f1, f28, f21, f1 | |||
| FMA3 f2, f29, f21, f2 | |||
| FMA4 f1, f29, f20, f1 | |||
| FMA3 f0, f29, f21, f0 | |||
| LFD f16, 8 * SIZE(AO) | |||
| LFD f17, 9 * SIZE(AO) | |||
| @@ -1234,9 +1234,9 @@ LL(22): | |||
| LFD f19, 11 * SIZE(AO) | |||
| FMA1 f4, f28, f22, f4 | |||
| FMA4 f7, f29, f22, f7 | |||
| FMA2 f5, f28, f23, f5 | |||
| FMA3 f6, f29, f23, f6 | |||
| FMA4 f5, f29, f22, f5 | |||
| FMA3 f4, f29, f23, f4 | |||
| LFD f20, 24 * SIZE(BO) | |||
| LFD f21, 25 * SIZE(BO) | |||
| @@ -1244,14 +1244,14 @@ LL(22): | |||
| LFD f23, 27 * SIZE(BO) | |||
| FMA1 f8, f28, f24, f8 | |||
| FMA4 f11, f29, f24, f11 | |||
| FMA2 f9, f28, f25, f9 | |||
| FMA3 f10, f29, f25, f10 | |||
| FMA4 f9, f29, f24, f9 | |||
| FMA3 f8, f29, f25, f8 | |||
| FMA1 f12, f28, f26, f12 | |||
| FMA4 f15, f29, f26, f15 | |||
| FMA2 f13, f28, f27, f13 | |||
| FMA3 f14, f29, f27, f14 | |||
| FMA4 f13, f29, f26, f13 | |||
| FMA3 f12, f29, f27, f12 | |||
| LFD f24, 28 * SIZE(BO) | |||
| LFD f25, 29 * SIZE(BO) | |||
| @@ -1259,14 +1259,14 @@ LL(22): | |||
| LFD f27, 31 * SIZE(BO) | |||
| FMA1 f0, f30, f20, f0 | |||
| FMA4 f3, f31, f20, f3 | |||
| FMA2 f1, f30, f21, f1 | |||
| FMA3 f2, f31, f21, f2 | |||
| FMA4 f1, f31, f20, f1 | |||
| FMA3 f0, f31, f21, f0 | |||
| FMA1 f4, f30, f22, f4 | |||
| FMA4 f7, f31, f22, f7 | |||
| FMA2 f5, f30, f23, f5 | |||
| FMA3 f6, f31, f23, f6 | |||
| FMA4 f5, f31, f22, f5 | |||
| FMA3 f4, f31, f23, f4 | |||
| LFD f20, 32 * SIZE(BO) | |||
| LFD f21, 33 * SIZE(BO) | |||
| @@ -1274,14 +1274,14 @@ LL(22): | |||
| LFD f23, 35 * SIZE(BO) | |||
| FMA1 f8, f30, f24, f8 | |||
| FMA4 f11, f31, f24, f11 | |||
| FMA2 f9, f30, f25, f9 | |||
| FMA3 f10, f31, f25, f10 | |||
| FMA4 f9, f31, f24, f9 | |||
| FMA3 f8, f31, f25, f8 | |||
| FMA1 f12, f30, f26, f12 | |||
| FMA4 f15, f31, f26, f15 | |||
| FMA2 f13, f30, f27, f13 | |||
| FMA3 f14, f31, f27, f14 | |||
| FMA4 f13, f31, f26, f13 | |||
| FMA3 f12, f31, f27, f12 | |||
| LFD f24, 36 * SIZE(BO) | |||
| LFD f25, 37 * SIZE(BO) | |||
| @@ -1318,14 +1318,14 @@ LL(25): | |||
| LL(26): | |||
| FMA1 f0, f16, f20, f0 | |||
| FMA4 f3, f17, f20, f3 | |||
| FMA2 f1, f16, f21, f1 | |||
| FMA3 f2, f17, f21, f2 | |||
| FMA4 f1, f17, f20, f1 | |||
| FMA3 f0, f17, f21, f0 | |||
| FMA1 f4, f16, f22, f4 | |||
| FMA4 f7, f17, f22, f7 | |||
| FMA2 f5, f16, f23, f5 | |||
| FMA3 f6, f17, f23, f6 | |||
| FMA4 f5, f17, f22, f5 | |||
| FMA3 f4, f17, f23, f4 | |||
| LFD f20, 8 * SIZE(BO) | |||
| LFD f21, 9 * SIZE(BO) | |||
| @@ -1333,14 +1333,14 @@ LL(26): | |||
| LFD f23, 11 * SIZE(BO) | |||
| FMA1 f8, f16, f24, f8 | |||
| FMA4 f11, f17, f24, f11 | |||
| FMA2 f9, f16, f25, f9 | |||
| FMA3 f10, f17, f25, f10 | |||
| FMA4 f9, f17, f24, f9 | |||
| FMA3 f8, f17, f25, f8 | |||
| FMA1 f12, f16, f26, f12 | |||
| FMA4 f15, f17, f26, f15 | |||
| FMA2 f13, f16, f27, f13 | |||
| FMA3 f14, f17, f27, f14 | |||
| FMA4 f13, f17, f26, f13 | |||
| FMA3 f12, f17, f27, f12 | |||
| LFD f16, 2 * SIZE(AO) | |||
| LFD f17, 3 * SIZE(AO) | |||
| @@ -1363,47 +1363,42 @@ LL(28): | |||
| LFD f18, 0 * SIZE(CO2) | |||
| LFD f19, 1 * SIZE(CO2) | |||
| FADD f0, f0, f2 | |||
| FADD f1, f1, f3 | |||
| FADD f4, f4, f6 | |||
| FADD f5, f5, f7 | |||
| LFD f20, 0 * SIZE(CO3) | |||
| LFD f21, 1 * SIZE(CO3) | |||
| LFD f22, 0 * SIZE(CO4) | |||
| LFD f23, 1 * SIZE(CO4) | |||
| FADD f8, f8, f10 | |||
| FADD f9, f9, f11 | |||
| FADD f12, f12, f14 | |||
| FADD f13, f13, f15 | |||
| fmr f2, f0 | |||
| fmr f3, f1 | |||
| fmr f6, f4 | |||
| fmr f7, f5 | |||
| FNMSUB f24, f31, f1, f16 | |||
| FMADD f25, f31, f0, f17 | |||
| FNMSUB f26, f31, f5, f18 | |||
| FMADD f27, f31, f4, f19 | |||
| FMADD f24, f30, f0, f16 | |||
| FMADD f25, f30, f1, f17 | |||
| FMADD f26, f30, f4, f18 | |||
| FMADD f27, f30, f5, f19 | |||
| FMADD f0, f30, f0, f24 | |||
| FMADD f1, f30, f1, f25 | |||
| FMADD f4, f30, f4, f26 | |||
| FMADD f5, f30, f5, f27 | |||
| FNMSUB f0, f31, f3, f24 | |||
| FMADD f1, f31, f2, f25 | |||
| FNMSUB f4, f31, f7, f26 | |||
| FMADD f5, f31, f6, f27 | |||
| FNMSUB f24, f31, f9, f20 | |||
| FMADD f25, f31, f8, f21 | |||
| FNMSUB f26, f31, f13, f22 | |||
| FMADD f27, f31, f12, f23 | |||
| fmr f10, f8 | |||
| fmr f11, f9 | |||
| fmr f14, f12 | |||
| fmr f15, f13 | |||
| FMADD f8, f30, f8, f24 | |||
| FMADD f9, f30, f9, f25 | |||
| FMADD f12, f30, f12, f26 | |||
| FMADD f13, f30, f13, f27 | |||
| FMADD f24, f30, f8, f20 | |||
| FMADD f25, f30, f9, f21 | |||
| FMADD f26, f30, f12, f22 | |||
| FMADD f27, f30, f13, f23 | |||
| #else | |||
| FADD f0, f0, f2 | |||
| FADD f1, f1, f3 | |||
| FADD f4, f4, f6 | |||
| FADD f5, f5, f7 | |||
| FNMSUB f8, f31, f11, f24 | |||
| FMADD f9, f31, f10, f25 | |||
| FNMSUB f12, f31, f15, f26 | |||
| FMADD f13, f31, f14, f27 | |||
| #else | |||
| FMUL f16, f31, f1 | |||
| FMUL f17, f31, f0 | |||
| FMUL f18, f31, f5 | |||
| @@ -1414,11 +1409,6 @@ LL(28): | |||
| FMSUB f4, f30, f4, f18 | |||
| FMADD f5, f30, f5, f19 | |||
| FADD f8, f8, f10 | |||
| FADD f9, f9, f11 | |||
| FADD f12, f12, f14 | |||
| FADD f13, f13, f15 | |||
| FMUL f20, f31, f9 | |||
| FMUL f21, f31, f8 | |||
| FMUL f22, f31, f13 | |||
| @@ -1616,15 +1606,15 @@ LL(32): | |||
| FMA2 f5, f16, f23, f5 | |||
| FMA2 f7, f18, f23, f7 | |||
| FMA4 f9, f17, f20, f9 | |||
| FMA4 f11, f19, f20, f11 | |||
| FMA3 f8, f17, f21, f8 | |||
| FMA3 f10, f19, f21, f10 | |||
| FMA4 f1, f17, f20, f1 | |||
| FMA4 f3, f19, f20, f3 | |||
| FMA3 f0, f17, f21, f0 | |||
| FMA3 f2, f19, f21, f2 | |||
| FMA4 f13, f17, f22, f13 | |||
| FMA4 f15, f19, f22, f15 | |||
| FMA3 f12, f17, f23, f12 | |||
| FMA3 f14, f19, f23, f14 | |||
| FMA4 f5, f17, f22, f5 | |||
| FMA4 f7, f19, f22, f7 | |||
| FMA3 f4, f17, f23, f4 | |||
| FMA3 f6, f19, f23, f6 | |||
| LFD f20, 8 * SIZE(BO) | |||
| LFD f21, 9 * SIZE(BO) | |||
| @@ -1646,15 +1636,15 @@ LL(32): | |||
| FMA2 f5, f28, f27, f5 | |||
| FMA2 f7, f30, f27, f7 | |||
| FMA4 f9, f29, f24, f9 | |||
| FMA4 f11, f31, f24, f11 | |||
| FMA3 f8, f29, f25, f8 | |||
| FMA3 f10, f31, f25, f10 | |||
| FMA4 f1, f29, f24, f1 | |||
| FMA4 f3, f31, f24, f3 | |||
| FMA3 f0, f29, f25, f0 | |||
| FMA3 f2, f31, f25, f2 | |||
| FMA4 f13, f29, f26, f13 | |||
| FMA4 f15, f31, f26, f15 | |||
| FMA3 f12, f29, f27, f12 | |||
| FMA3 f14, f31, f27, f14 | |||
| FMA4 f5, f29, f26, f5 | |||
| FMA4 f7, f31, f26, f7 | |||
| FMA3 f4, f29, f27, f4 | |||
| FMA3 f6, f31, f27, f6 | |||
| LFD f24, 12 * SIZE(BO) | |||
| LFD f25, 13 * SIZE(BO) | |||
| @@ -1676,15 +1666,15 @@ LL(32): | |||
| FMA2 f5, f16, f23, f5 | |||
| FMA2 f7, f18, f23, f7 | |||
| FMA4 f9, f17, f20, f9 | |||
| FMA4 f11, f19, f20, f11 | |||
| FMA3 f8, f17, f21, f8 | |||
| FMA3 f10, f19, f21, f10 | |||
| FMA4 f1, f17, f20, f1 | |||
| FMA4 f3, f19, f20, f3 | |||
| FMA3 f0, f17, f21, f0 | |||
| FMA3 f2, f19, f21, f2 | |||
| FMA4 f13, f17, f22, f13 | |||
| FMA4 f15, f19, f22, f15 | |||
| FMA3 f12, f17, f23, f12 | |||
| FMA3 f14, f19, f23, f14 | |||
| FMA4 f5, f17, f22, f5 | |||
| FMA4 f7, f19, f22, f7 | |||
| FMA3 f4, f17, f23, f4 | |||
| FMA3 f6, f19, f23, f6 | |||
| LFD f20, 16 * SIZE(BO) | |||
| LFD f21, 17 * SIZE(BO) | |||
| @@ -1706,15 +1696,15 @@ LL(32): | |||
| FMA2 f5, f28, f27, f5 | |||
| FMA2 f7, f30, f27, f7 | |||
| FMA4 f9, f29, f24, f9 | |||
| FMA4 f11, f31, f24, f11 | |||
| FMA3 f8, f29, f25, f8 | |||
| FMA3 f10, f31, f25, f10 | |||
| FMA4 f1, f29, f24, f1 | |||
| FMA4 f3, f31, f24, f3 | |||
| FMA3 f0, f29, f25, f0 | |||
| FMA3 f2, f31, f25, f2 | |||
| FMA4 f13, f29, f26, f13 | |||
| FMA4 f15, f31, f26, f15 | |||
| FMA3 f12, f29, f27, f12 | |||
| FMA3 f14, f31, f27, f14 | |||
| FMA4 f5, f29, f26, f5 | |||
| FMA4 f7, f31, f26, f7 | |||
| FMA3 f4, f29, f27, f4 | |||
| FMA3 f6, f31, f27, f6 | |||
| LFD f24, 20 * SIZE(BO) | |||
| LFD f25, 21 * SIZE(BO) | |||
| @@ -1736,15 +1726,15 @@ LL(32): | |||
| FMA2 f5, f16, f23, f5 | |||
| FMA2 f7, f18, f23, f7 | |||
| FMA4 f9, f17, f20, f9 | |||
| FMA4 f11, f19, f20, f11 | |||
| FMA3 f8, f17, f21, f8 | |||
| FMA3 f10, f19, f21, f10 | |||
| FMA4 f1, f17, f20, f1 | |||
| FMA4 f3, f19, f20, f3 | |||
| FMA3 f0, f17, f21, f0 | |||
| FMA3 f2, f19, f21, f2 | |||
| FMA4 f13, f17, f22, f13 | |||
| FMA4 f15, f19, f22, f15 | |||
| FMA3 f12, f17, f23, f12 | |||
| FMA3 f14, f19, f23, f14 | |||
| FMA4 f5, f17, f22, f5 | |||
| FMA4 f7, f19, f22, f7 | |||
| FMA3 f4, f17, f23, f4 | |||
| FMA3 f6, f19, f23, f6 | |||
| LFD f20, 24 * SIZE(BO) | |||
| LFD f21, 25 * SIZE(BO) | |||
| @@ -1766,15 +1756,15 @@ LL(32): | |||
| FMA2 f5, f28, f27, f5 | |||
| FMA2 f7, f30, f27, f7 | |||
| FMA4 f9, f29, f24, f9 | |||
| FMA4 f11, f31, f24, f11 | |||
| FMA3 f8, f29, f25, f8 | |||
| FMA3 f10, f31, f25, f10 | |||
| FMA4 f1, f29, f24, f1 | |||
| FMA4 f3, f31, f24, f3 | |||
| FMA3 f0, f29, f25, f0 | |||
| FMA3 f2, f31, f25, f2 | |||
| FMA4 f13, f29, f26, f13 | |||
| FMA4 f15, f31, f26, f15 | |||
| FMA3 f12, f29, f27, f12 | |||
| FMA3 f14, f31, f27, f14 | |||
| FMA4 f5, f29, f26, f5 | |||
| FMA4 f7, f31, f26, f7 | |||
| FMA3 f4, f29, f27, f4 | |||
| FMA3 f6, f31, f27, f6 | |||
| LFD f24, 28 * SIZE(BO) | |||
| LFD f25, 29 * SIZE(BO) | |||
| @@ -1796,15 +1786,15 @@ LL(32): | |||
| FMA2 f5, f16, f23, f5 | |||
| FMA2 f7, f18, f23, f7 | |||
| FMA4 f9, f17, f20, f9 | |||
| FMA4 f11, f19, f20, f11 | |||
| FMA3 f8, f17, f21, f8 | |||
| FMA3 f10, f19, f21, f10 | |||
| FMA4 f1, f17, f20, f1 | |||
| FMA4 f3, f19, f20, f3 | |||
| FMA3 f0, f17, f21, f0 | |||
| FMA3 f2, f19, f21, f2 | |||
| FMA4 f13, f17, f22, f13 | |||
| FMA4 f15, f19, f22, f15 | |||
| FMA3 f12, f17, f23, f12 | |||
| FMA3 f14, f19, f23, f14 | |||
| FMA4 f5, f17, f22, f5 | |||
| FMA4 f7, f19, f22, f7 | |||
| FMA3 f4, f17, f23, f4 | |||
| FMA3 f6, f19, f23, f6 | |||
| LFD f20, 32 * SIZE(BO) | |||
| LFD f21, 33 * SIZE(BO) | |||
| @@ -1826,15 +1816,15 @@ LL(32): | |||
| FMA2 f5, f28, f27, f5 | |||
| FMA2 f7, f30, f27, f7 | |||
| FMA4 f9, f29, f24, f9 | |||
| FMA4 f11, f31, f24, f11 | |||
| FMA3 f8, f29, f25, f8 | |||
| FMA3 f10, f31, f25, f10 | |||
| FMA4 f1, f29, f24, f1 | |||
| FMA4 f3, f31, f24, f3 | |||
| FMA3 f0, f29, f25, f0 | |||
| FMA3 f2, f31, f25, f2 | |||
| FMA4 f13, f29, f26, f13 | |||
| FMA4 f15, f31, f26, f15 | |||
| FMA3 f12, f29, f27, f12 | |||
| FMA3 f14, f31, f27, f14 | |||
| FMA4 f5, f29, f26, f5 | |||
| FMA4 f7, f31, f26, f7 | |||
| FMA3 f4, f29, f27, f4 | |||
| FMA3 f6, f31, f27, f6 | |||
| LFD f24, 36 * SIZE(BO) | |||
| LFD f25, 37 * SIZE(BO) | |||
| @@ -1883,20 +1873,20 @@ LL(36): | |||
| FMA2 f5, f16, f23, f5 | |||
| FMA2 f7, f18, f23, f7 | |||
| FMA4 f9, f17, f20, f9 | |||
| FMA4 f11, f19, f20, f11 | |||
| FMA3 f8, f17, f21, f8 | |||
| FMA3 f10, f19, f21, f10 | |||
| FMA4 f1, f17, f20, f1 | |||
| FMA4 f3, f19, f20, f3 | |||
| FMA3 f0, f17, f21, f0 | |||
| FMA3 f2, f19, f21, f2 | |||
| LFD f16, 4 * SIZE(AO) | |||
| LFD f18, 6 * SIZE(AO) | |||
| LFD f20, 4 * SIZE(BO) | |||
| LFD f21, 5 * SIZE(BO) | |||
| FMA4 f13, f17, f22, f13 | |||
| FMA4 f15, f19, f22, f15 | |||
| FMA3 f12, f17, f23, f12 | |||
| FMA3 f14, f19, f23, f14 | |||
| FMA4 f5, f17, f22, f5 | |||
| FMA4 f7, f19, f22, f7 | |||
| FMA3 f4, f17, f23, f4 | |||
| FMA3 f6, f19, f23, f6 | |||
| LFD f17, 5 * SIZE(AO) | |||
| LFD f19, 7 * SIZE(AO) | |||
| @@ -1916,52 +1906,42 @@ LL(38): | |||
| LFD f18, 2 * SIZE(CO1) | |||
| LFD f19, 3 * SIZE(CO1) | |||
| FADD f0, f0, f8 | |||
| FADD f1, f1, f9 | |||
| FADD f2, f2, f10 | |||
| FADD f3, f3, f11 | |||
| LFD f20, 0 * SIZE(CO2) | |||
| LFD f21, 1 * SIZE(CO2) | |||
| LFD f22, 2 * SIZE(CO2) | |||
| LFD f23, 3 * SIZE(CO2) | |||
| FADD f4, f4, f12 | |||
| FADD f5, f5, f13 | |||
| FADD f6, f6, f14 | |||
| FADD f7, f7, f15 | |||
| fmr f8, f0 | |||
| fmr f9, f1 | |||
| fmr f10, f2 | |||
| fmr f11, f3 | |||
| FNMSUB f24, f31, f1, f16 | |||
| FMADD f25, f31, f0, f17 | |||
| FNMSUB f26, f31, f3, f18 | |||
| FMADD f27, f31, f2, f19 | |||
| FMADD f24, f30, f0, f16 | |||
| FMADD f25, f30, f1, f17 | |||
| FMADD f26, f30, f2, f18 | |||
| FMADD f27, f30, f3, f19 | |||
| FMADD f0, f30, f0, f24 | |||
| FMADD f1, f30, f1, f25 | |||
| FMADD f2, f30, f2, f26 | |||
| FMADD f3, f30, f3, f27 | |||
| FNMSUB f0, f31, f9, f24 | |||
| FMADD f1, f31, f8, f25 | |||
| FNMSUB f2, f31, f11, f26 | |||
| FMADD f3, f31, f10, f27 | |||
| FNMSUB f24, f31, f5, f20 | |||
| FMADD f25, f31, f4, f21 | |||
| FNMSUB f26, f31, f7, f22 | |||
| FMADD f27, f31, f6, f23 | |||
| fmr f12, f4 | |||
| fmr f13, f5 | |||
| fmr f14, f6 | |||
| fmr f15, f7 | |||
| FMADD f4, f30, f4, f24 | |||
| FMADD f5, f30, f5, f25 | |||
| FMADD f6, f30, f6, f26 | |||
| FMADD f7, f30, f7, f27 | |||
| FMADD f24, f30, f4, f20 | |||
| FMADD f25, f30, f5, f21 | |||
| FMADD f26, f30, f6, f22 | |||
| FMADD f27, f30, f7, f23 | |||
| #else | |||
| FADD f0, f0, f8 | |||
| FADD f1, f1, f9 | |||
| FADD f2, f2, f10 | |||
| FADD f3, f3, f11 | |||
| FADD f4, f4, f12 | |||
| FADD f5, f5, f13 | |||
| FADD f6, f6, f14 | |||
| FADD f7, f7, f15 | |||
| FNMSUB f4, f31, f13, f24 | |||
| FMADD f5, f31, f12, f25 | |||
| FNMSUB f6, f31, f15, f26 | |||
| FMADD f7, f31, f14, f27 | |||
| #else | |||
| FMUL f16, f31, f1 | |||
| FMUL f17, f31, f0 | |||
| FMUL f18, f31, f3 | |||
| @@ -2101,14 +2081,14 @@ LL(40): | |||
| LL(42): | |||
| FMA1 f0, f16, f20, f0 | |||
| FMA4 f3, f17, f20, f3 | |||
| FMA2 f1, f16, f21, f1 | |||
| FMA3 f2, f17, f21, f2 | |||
| FMA4 f1, f17, f20, f1 | |||
| FMA3 f0, f17, f21, f0 | |||
| FMA1 f4, f16, f22, f4 | |||
| FMA4 f7, f17, f22, f7 | |||
| FMA2 f5, f16, f23, f5 | |||
| FMA3 f6, f17, f23, f6 | |||
| FMA4 f5, f17, f22, f5 | |||
| FMA3 f4, f17, f23, f4 | |||
| LFD f16, 2 * SIZE(AO) | |||
| LFD f17, 3 * SIZE(AO) | |||
| @@ -2119,14 +2099,14 @@ LL(42): | |||
| LFD f23, 7 * SIZE(BO) | |||
| FMA1 f0, f16, f20, f0 | |||
| FMA4 f3, f17, f20, f3 | |||
| FMA2 f1, f16, f21, f1 | |||
| FMA3 f2, f17, f21, f2 | |||
| FMA4 f1, f17, f20, f1 | |||
| FMA3 f0, f17, f21, f0 | |||
| FMA1 f4, f16, f22, f4 | |||
| FMA4 f7, f17, f22, f7 | |||
| FMA2 f5, f16, f23, f5 | |||
| FMA3 f6, f17, f23, f6 | |||
| FMA4 f5, f17, f22, f5 | |||
| FMA3 f4, f17, f23, f4 | |||
| LFD f16, 4 * SIZE(AO) | |||
| LFD f17, 5 * SIZE(AO) | |||
| @@ -2137,14 +2117,14 @@ LL(42): | |||
| LFD f23, 11 * SIZE(BO) | |||
| FMA1 f0, f16, f20, f0 | |||
| FMA4 f3, f17, f20, f3 | |||
| FMA2 f1, f16, f21, f1 | |||
| FMA3 f2, f17, f21, f2 | |||
| FMA4 f1, f17, f20, f1 | |||
| FMA3 f0, f17, f21, f0 | |||
| FMA1 f4, f16, f22, f4 | |||
| FMA4 f7, f17, f22, f7 | |||
| FMA2 f5, f16, f23, f5 | |||
| FMA3 f6, f17, f23, f6 | |||
| FMA4 f5, f17, f22, f5 | |||
| FMA3 f4, f17, f23, f4 | |||
| LFD f16, 6 * SIZE(AO) | |||
| LFD f17, 7 * SIZE(AO) | |||
| @@ -2155,14 +2135,14 @@ LL(42): | |||
| LFD f23, 15 * SIZE(BO) | |||
| FMA1 f0, f16, f20, f0 | |||
| FMA4 f3, f17, f20, f3 | |||
| FMA2 f1, f16, f21, f1 | |||
| FMA3 f2, f17, f21, f2 | |||
| FMA4 f1, f17, f20, f1 | |||
| FMA3 f0, f17, f21, f0 | |||
| FMA1 f4, f16, f22, f4 | |||
| FMA4 f7, f17, f22, f7 | |||
| FMA2 f5, f16, f23, f5 | |||
| FMA3 f6, f17, f23, f6 | |||
| FMA4 f5, f17, f22, f5 | |||
| FMA3 f4, f17, f23, f4 | |||
| LFD f16, 8 * SIZE(AO) | |||
| LFD f17, 9 * SIZE(AO) | |||
| @@ -2202,14 +2182,14 @@ LL(45): | |||
| LL(46): | |||
| FMA1 f0, f16, f20, f0 | |||
| FMA4 f3, f17, f20, f3 | |||
| FMA2 f1, f16, f21, f1 | |||
| FMA3 f2, f17, f21, f2 | |||
| FMA4 f1, f17, f20, f1 | |||
| FMA3 f0, f17, f21, f0 | |||
| FMA1 f4, f16, f22, f4 | |||
| FMA4 f7, f17, f22, f7 | |||
| FMA2 f5, f16, f23, f5 | |||
| FMA3 f6, f17, f23, f6 | |||
| FMA4 f5, f17, f22, f5 | |||
| FMA3 f4, f17, f23, f4 | |||
| LFD f16, 2 * SIZE(AO) | |||
| LFD f17, 3 * SIZE(AO) | |||
| @@ -2231,27 +2211,22 @@ LL(48): | |||
| LFD f20, 0 * SIZE(CO2) | |||
| LFD f21, 1 * SIZE(CO2) | |||
| FADD f0, f0, f2 | |||
| FADD f1, f1, f3 | |||
| FADD f4, f4, f6 | |||
| FADD f5, f5, f7 | |||
| fmr f2, f0 | |||
| fmr f3, f1 | |||
| fmr f6, f4 | |||
| fmr f7, f5 | |||
| FNMSUB f24, f31, f1, f16 | |||
| FMADD f25, f31, f0, f17 | |||
| FNMSUB f26, f31, f5, f20 | |||
| FMADD f27, f31, f4, f21 | |||
| FMADD f24, f30, f0, f16 | |||
| FMADD f25, f30, f1, f17 | |||
| FMADD f26, f30, f4, f20 | |||
| FMADD f27, f30, f5, f21 | |||
| FMADD f0, f30, f0, f24 | |||
| FMADD f1, f30, f1, f25 | |||
| FMADD f4, f30, f4, f26 | |||
| FMADD f5, f30, f5, f27 | |||
| FNMSUB f0, f31, f3, f24 | |||
| FMADD f1, f31, f2, f25 | |||
| FNMSUB f4, f31, f7, f26 | |||
| FMADD f5, f31, f6, f27 | |||
| #else | |||
| FADD f0, f0, f2 | |||
| FADD f1, f1, f3 | |||
| FADD f4, f4, f6 | |||
| FADD f5, f5, f7 | |||
| FMUL f16, f31, f1 | |||
| FMUL f17, f31, f0 | |||
| FMUL f18, f31, f5 | |||
| @@ -2401,10 +2376,10 @@ LL(52): | |||
| FMA2 f1, f16, f21, f1 | |||
| FMA2 f3, f18, f21, f3 | |||
| FMA4 f9, f17, f20, f9 | |||
| FMA4 f11, f19, f20, f11 | |||
| FMA3 f8, f17, f21, f8 | |||
| FMA3 f10, f19, f21, f10 | |||
| FMA4 f1, f17, f20, f1 | |||
| FMA4 f3, f19, f20, f3 | |||
| FMA3 f0, f17, f21, f0 | |||
| FMA3 f2, f19, f21, f2 | |||
| LFD f16, 4 * SIZE(AO) | |||
| LFD f17, 5 * SIZE(AO) | |||
| @@ -2416,10 +2391,10 @@ LL(52): | |||
| FMA2 f1, f16, f23, f1 | |||
| FMA2 f3, f18, f23, f3 | |||
| FMA4 f9, f17, f22, f9 | |||
| FMA4 f11, f19, f22, f11 | |||
| FMA3 f8, f17, f23, f8 | |||
| FMA3 f10, f19, f23, f10 | |||
| FMA4 f1, f17, f22, f1 | |||
| FMA4 f3, f19, f22, f3 | |||
| FMA3 f0, f17, f23, f0 | |||
| FMA3 f2, f19, f23, f2 | |||
| LFD f16, 8 * SIZE(AO) | |||
| LFD f17, 9 * SIZE(AO) | |||
| @@ -2436,10 +2411,10 @@ LL(52): | |||
| FMA2 f1, f16, f21, f1 | |||
| FMA2 f3, f18, f21, f3 | |||
| FMA4 f9, f17, f20, f9 | |||
| FMA4 f11, f19, f20, f11 | |||
| FMA3 f8, f17, f21, f8 | |||
| FMA3 f10, f19, f21, f10 | |||
| FMA4 f1, f17, f20, f1 | |||
| FMA4 f3, f19, f20, f3 | |||
| FMA3 f0, f17, f21, f0 | |||
| FMA3 f2, f19, f21, f2 | |||
| LFD f16, 12 * SIZE(AO) | |||
| LFD f17, 13 * SIZE(AO) | |||
| @@ -2451,10 +2426,10 @@ LL(52): | |||
| FMA2 f1, f16, f23, f1 | |||
| FMA2 f3, f18, f23, f3 | |||
| FMA4 f9, f17, f22, f9 | |||
| FMA4 f11, f19, f22, f11 | |||
| FMA3 f8, f17, f23, f8 | |||
| FMA3 f10, f19, f23, f10 | |||
| FMA4 f1, f17, f22, f1 | |||
| FMA4 f3, f19, f22, f3 | |||
| FMA3 f0, f17, f23, f0 | |||
| FMA3 f2, f19, f23, f2 | |||
| LFD f16, 16 * SIZE(AO) | |||
| LFD f17, 17 * SIZE(AO) | |||
| @@ -2471,10 +2446,10 @@ LL(52): | |||
| FMA2 f1, f16, f21, f1 | |||
| FMA2 f3, f18, f21, f3 | |||
| FMA4 f9, f17, f20, f9 | |||
| FMA4 f11, f19, f20, f11 | |||
| FMA3 f8, f17, f21, f8 | |||
| FMA3 f10, f19, f21, f10 | |||
| FMA4 f1, f17, f20, f1 | |||
| FMA4 f3, f19, f20, f3 | |||
| FMA3 f0, f17, f21, f0 | |||
| FMA3 f2, f19, f21, f2 | |||
| LFD f16, 20 * SIZE(AO) | |||
| LFD f17, 21 * SIZE(AO) | |||
| @@ -2486,10 +2461,10 @@ LL(52): | |||
| FMA2 f1, f16, f23, f1 | |||
| FMA2 f3, f18, f23, f3 | |||
| FMA4 f9, f17, f22, f9 | |||
| FMA4 f11, f19, f22, f11 | |||
| FMA3 f8, f17, f23, f8 | |||
| FMA3 f10, f19, f23, f10 | |||
| FMA4 f1, f17, f22, f1 | |||
| FMA4 f3, f19, f22, f3 | |||
| FMA3 f0, f17, f23, f0 | |||
| FMA3 f2, f19, f23, f2 | |||
| LFD f16, 24 * SIZE(AO) | |||
| LFD f17, 25 * SIZE(AO) | |||
| @@ -2506,10 +2481,10 @@ LL(52): | |||
| FMA2 f1, f16, f21, f1 | |||
| FMA2 f3, f18, f21, f3 | |||
| FMA4 f9, f17, f20, f9 | |||
| FMA4 f11, f19, f20, f11 | |||
| FMA3 f8, f17, f21, f8 | |||
| FMA3 f10, f19, f21, f10 | |||
| FMA4 f1, f17, f20, f1 | |||
| FMA4 f3, f19, f20, f3 | |||
| FMA3 f0, f17, f21, f0 | |||
| FMA3 f2, f19, f21, f2 | |||
| LFD f16, 28 * SIZE(AO) | |||
| LFD f17, 29 * SIZE(AO) | |||
| @@ -2521,10 +2496,10 @@ LL(52): | |||
| FMA2 f1, f16, f23, f1 | |||
| FMA2 f3, f18, f23, f3 | |||
| FMA4 f9, f17, f22, f9 | |||
| FMA4 f11, f19, f22, f11 | |||
| FMA3 f8, f17, f23, f8 | |||
| FMA3 f10, f19, f23, f10 | |||
| FMA4 f1, f17, f22, f1 | |||
| FMA4 f3, f19, f22, f3 | |||
| FMA3 f0, f17, f23, f0 | |||
| FMA3 f2, f19, f23, f2 | |||
| LFD f16, 32 * SIZE(AO) | |||
| LFD f17, 33 * SIZE(AO) | |||
| @@ -2573,10 +2548,10 @@ LL(56): | |||
| LFD f16, 4 * SIZE(AO) | |||
| LFD f18, 6 * SIZE(AO) | |||
| FMA4 f9, f17, f20, f9 | |||
| FMA4 f11, f19, f20, f11 | |||
| FMA3 f8, f17, f21, f8 | |||
| FMA3 f10, f19, f21, f10 | |||
| FMA4 f1, f17, f20, f1 | |||
| FMA4 f3, f19, f20, f3 | |||
| FMA3 f0, f17, f21, f0 | |||
| FMA3 f2, f19, f21, f2 | |||
| LFD f17, 5 * SIZE(AO) | |||
| LFD f19, 7 * SIZE(AO) | |||
| @@ -2595,27 +2570,22 @@ LL(58): | |||
| LFD f18, 2 * SIZE(CO1) | |||
| LFD f19, 3 * SIZE(CO1) | |||
| FADD f0, f0, f8 | |||
| FADD f1, f1, f9 | |||
| FADD f2, f2, f10 | |||
| FADD f3, f3, f11 | |||
| fmr f8, f0 | |||
| fmr f9, f1 | |||
| fmr f10, f2 | |||
| fmr f11, f3 | |||
| FNMSUB f24, f31, f1, f16 | |||
| FMADD f25, f31, f0, f17 | |||
| FNMSUB f26, f31, f3, f18 | |||
| FMADD f27, f31, f2, f19 | |||
| FMADD f24, f30, f0, f16 | |||
| FMADD f25, f30, f1, f17 | |||
| FMADD f26, f30, f2, f18 | |||
| FMADD f27, f30, f3, f19 | |||
| FMADD f0, f30, f0, f24 | |||
| FMADD f1, f30, f1, f25 | |||
| FMADD f2, f30, f2, f26 | |||
| FMADD f3, f30, f3, f27 | |||
| FNMSUB f0, f31, f9, f24 | |||
| FMADD f1, f31, f8, f25 | |||
| FNMSUB f2, f31, f11, f26 | |||
| FMADD f3, f31, f10, f27 | |||
| #else | |||
| FADD f0, f0, f8 | |||
| FADD f1, f1, f9 | |||
| FADD f2, f2, f10 | |||
| FADD f3, f3, f11 | |||
| FMUL f16, f31, f1 | |||
| FMUL f17, f31, f0 | |||
| FMUL f18, f31, f3 | |||
| @@ -2735,9 +2705,9 @@ LL(60): | |||
| LL(62): | |||
| FMA1 f0, f16, f20, f0 | |||
| FMA4 f3, f17, f20, f3 | |||
| FMA2 f1, f16, f21, f1 | |||
| FMA3 f2, f17, f21, f2 | |||
| FMA4 f1, f17, f20, f1 | |||
| FMA3 f0, f17, f21, f0 | |||
| LFD f16, 4 * SIZE(AO) | |||
| LFD f17, 5 * SIZE(AO) | |||
| @@ -2745,9 +2715,9 @@ LL(62): | |||
| LFD f21, 5 * SIZE(BO) | |||
| FMA1 f0, f18, f22, f0 | |||
| FMA4 f3, f19, f22, f3 | |||
| FMA2 f1, f18, f23, f1 | |||
| FMA3 f2, f19, f23, f2 | |||
| FMA4 f1, f19, f22, f1 | |||
| FMA3 f0, f19, f23, f0 | |||
| LFD f18, 6 * SIZE(AO) | |||
| LFD f19, 7 * SIZE(AO) | |||
| @@ -2755,9 +2725,9 @@ LL(62): | |||
| LFD f23, 7 * SIZE(BO) | |||
| FMA1 f0, f16, f20, f0 | |||
| FMA4 f3, f17, f20, f3 | |||
| FMA2 f1, f16, f21, f1 | |||
| FMA3 f2, f17, f21, f2 | |||
| FMA4 f1, f17, f20, f1 | |||
| FMA3 f0, f17, f21, f0 | |||
| LFD f16, 8 * SIZE(AO) | |||
| LFD f17, 9 * SIZE(AO) | |||
| @@ -2765,9 +2735,9 @@ LL(62): | |||
| LFD f21, 9 * SIZE(BO) | |||
| FMA1 f0, f18, f22, f0 | |||
| FMA4 f3, f19, f22, f3 | |||
| FMA2 f1, f18, f23, f1 | |||
| FMA3 f2, f19, f23, f2 | |||
| FMA4 f1, f19, f22, f1 | |||
| FMA3 f0, f19, f23, f0 | |||
| LFD f18, 10 * SIZE(AO) | |||
| LFD f19, 11 * SIZE(AO) | |||
| @@ -2803,11 +2773,11 @@ LL(65): | |||
| LL(66): | |||
| FMA1 f0, f16, f20, f0 | |||
| FMA4 f3, f17, f20, f3 | |||
| LFD f20, 2 * SIZE(BO) | |||
| FMA2 f1, f16, f21, f1 | |||
| LFD f16, 2 * SIZE(AO) | |||
| FMA3 f2, f17, f21, f2 | |||
| FMA4 f1, f17, f20, f1 | |||
| LFD f20, 2 * SIZE(BO) | |||
| FMA3 f0, f17, f21, f0 | |||
| LFD f17, 3 * SIZE(AO) | |||
| LFD f21, 3 * SIZE(BO) | |||
| @@ -2821,20 +2791,17 @@ LL(68): | |||
| LFD f16, 0 * SIZE(CO1) | |||
| LFD f17, 1 * SIZE(CO1) | |||
| FADD f0, f0, f2 | |||
| FADD f1, f1, f3 | |||
| fmr f2, f0 | |||
| fmr f3, f1 | |||
| FNMSUB f24, f31, f1, f16 | |||
| FMADD f25, f31, f0, f17 | |||
| FMADD f24, f30, f0, f16 | |||
| FMADD f25, f30, f1, f17 | |||
| FMADD f0, f30, f0, f24 | |||
| FMADD f1, f30, f1, f25 | |||
| FNMSUB f0, f31, f3, f24 | |||
| FMADD f1, f31, f2, f25 | |||
| #else | |||
| FADD f0, f0, f2 | |||
| FADD f1, f1, f3 | |||
| FMUL f16, f31, f1 | |||
| FMUL f17, f31, f0 | |||
| @@ -99,26 +99,26 @@ ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c | |||
| SGEMMKERNEL = ../generic/gemmkernel_2x2.c | |||
| SGEMMONCOPY = ../generic/gemm_ncopy_2.c | |||
| SGEMMOTCOPY = ../generic/gemm_tcopy_2.c | |||
| SGEMMONCOPYOBJ = sgemm_oncopy.o | |||
| SGEMMOTCOPYOBJ = sgemm_otcopy.o | |||
| SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMKERNEL = ../generic/gemmkernel_2x2.c | |||
| DGEMMONCOPY = ../generic/gemm_ncopy_2.c | |||
| DGEMMOTCOPY = ../generic/gemm_tcopy_2.c | |||
| DGEMMONCOPYOBJ = dgemm_oncopy.o | |||
| DGEMMOTCOPYOBJ = dgemm_otcopy.o | |||
| DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMKERNEL = ../generic/zgemmkernel_2x2.c | |||
| CGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||
| CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||
| CGEMMONCOPYOBJ = cgemm_oncopy.o | |||
| CGEMMOTCOPYOBJ = cgemm_otcopy.o | |||
| CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c | |||
| ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||
| ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||
| ZGEMMONCOPYOBJ = zgemm_oncopy.o | |||
| ZGEMMOTCOPYOBJ = zgemm_otcopy.o | |||
| ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| @@ -43,9 +43,9 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS | |||
| if ( (n <= 0) || (inc_x <= 0)) | |||
| return(0); | |||
| while(j < n) | |||
| { | |||
| if (dummy2 == 1) { | |||
| while(j < n) | |||
| { | |||
| if ( da == 0.0 ) | |||
| if (isfinite(x[i])) | |||
| @@ -57,7 +57,19 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS | |||
| i += inc_x ; | |||
| j++; | |||
| } | |||
| } else { | |||
| while(j < n) | |||
| { | |||
| if ( da == 0.0 ) | |||
| x[i]=0.0; | |||
| else | |||
| x[i] = da * x[i] ; | |||
| i += inc_x ; | |||
| j++; | |||
| } | |||
| } | |||
| return 0; | |||
| @@ -56,7 +56,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS | |||
| FLOAT_V_T v0; | |||
| if(inc_x == 1) { | |||
| if(da == 0.0) { | |||
| if(dummy2 == 0 && da == 0.0) { | |||
| int gvl = VSETVL_MAX; | |||
| v0 = VFMVVF_FLOAT(0.0, gvl); | |||
| for (size_t vl; n > 0; n -= vl, x += vl) { | |||
| @@ -75,7 +75,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS | |||
| } else { | |||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | |||
| if(da == 0.0) { | |||
| if(dummy2 == 0 && da == 0.0) { | |||
| int gvl = VSETVL_MAX; | |||
| v0 = VFMVVF_FLOAT(0.0, gvl); | |||
| for (size_t vl; n > 0; n -= vl, x += vl*inc_x) { | |||
| @@ -71,7 +71,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS | |||
| FLOAT_V_T v0, v1; | |||
| unsigned int gvl = 0; | |||
| if(inc_x == 1){ | |||
| if (0){ //if(da == 0.0){ | |||
| if(dummy2 == 0 && da == 0.0){ | |||
| memset(&x[0], 0, n * sizeof(FLOAT)); | |||
| }else{ | |||
| gvl = VSETVL(n); | |||
| @@ -96,7 +96,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS | |||
| } | |||
| } | |||
| }else{ | |||
| if (0) { //if(da == 0.0){ | |||
| if(dummy2 == 0 && da == 0.0){ | |||
| BLASLONG stride_x = inc_x * sizeof(FLOAT); | |||
| BLASLONG ix = 0; | |||
| gvl = VSETVL(n); | |||
| @@ -1244,6 +1244,36 @@ static void init_parameter(void) { | |||
| } | |||
| #else //ZARCH | |||
| #if (ARCH_RISCV64) | |||
| static void init_parameter(void) { | |||
| #ifdef BUILD_BFLOAT16 | |||
| TABLE_NAME.sbgemm_p = SBGEMM_DEFAULT_P; | |||
| #endif | |||
| TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; | |||
| TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; | |||
| TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; | |||
| TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; | |||
| #ifdef BUILD_BFLOAT16 | |||
| TABLE_NAME.sbgemm_r = SBGEMM_DEFAULT_R; | |||
| #endif | |||
| TABLE_NAME.sgemm_r = SGEMM_DEFAULT_R; | |||
| TABLE_NAME.dgemm_r = DGEMM_DEFAULT_R; | |||
| TABLE_NAME.cgemm_r = CGEMM_DEFAULT_R; | |||
| TABLE_NAME.zgemm_r = ZGEMM_DEFAULT_R; | |||
| #ifdef BUILD_BFLOAT16 | |||
| TABLE_NAME.sbgemm_q = SBGEMM_DEFAULT_Q; | |||
| #endif | |||
| TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q; | |||
| TABLE_NAME.dgemm_q = DGEMM_DEFAULT_Q; | |||
| TABLE_NAME.cgemm_q = CGEMM_DEFAULT_Q; | |||
| TABLE_NAME.zgemm_q = ZGEMM_DEFAULT_Q; | |||
| } | |||
| #else //RISCV64 | |||
| #ifdef ARCH_X86 | |||
| static int get_l2_size_old(void){ | |||
| int i, eax, ebx, ecx, edx, cpuid_level; | |||
| @@ -2046,6 +2076,7 @@ static void init_parameter(void) { | |||
| } | |||
| #endif //RISCV64 | |||
| #endif //POWER | |||
| #endif //ZARCH | |||
| #endif //(ARCH_LOONGARCH64) | |||
| @@ -57,19 +57,24 @@ | |||
| #ifdef XDOUBLE | |||
| movl 44(%esp),%edi | |||
| movl 48(%esp),%esi | |||
| movl 64(%esp),%ecx | |||
| #elif defined(DOUBLE) | |||
| movl 36(%esp),%edi | |||
| movl 40(%esp),%esi | |||
| movl 56(%esp),%ecx | |||
| #else | |||
| movl 32(%esp),%edi | |||
| movl 36(%esp),%esi | |||
| movl 52(%esp),%ecx | |||
| #endif | |||
| ftst | |||
| fnstsw %ax | |||
| andb $68, %ah | |||
| // je .L300 # Alpha != ZERO | |||
| jmp .L300 | |||
| je .L300 # Alpha != ZERO | |||
| cmpl $1,%ecx # dummy2 flag | |||
| je .L300 | |||
| /* Alpha == ZERO */ | |||
| cmpl $1,%esi | |||
| @@ -43,21 +43,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| static void dscal_kernel_8( BLASLONG n, FLOAT *da , FLOAT *x ) | |||
| { | |||
| BLASLONG i; | |||
| FLOAT alpha = *da; | |||
| for( i=0; i<n; i+=8 ) | |||
| { | |||
| x[0] *= alpha; | |||
| x[1] *= alpha; | |||
| x[2] *= alpha; | |||
| x[3] *= alpha; | |||
| x[4] *= alpha; | |||
| x[5] *= alpha; | |||
| x[6] *= alpha; | |||
| x[7] *= alpha; | |||
| x+=8; | |||
| } | |||
| BLASLONG i; | |||
| FLOAT alpha = *da; | |||
| for( i=0; i<n; i+=8 ) | |||
| { | |||
| x[0] *= alpha; | |||
| x[1] *= alpha; | |||
| x[2] *= alpha; | |||
| x[3] *= alpha; | |||
| x[4] *= alpha; | |||
| x[5] *= alpha; | |||
| x[6] *= alpha; | |||
| x[7] *= alpha; | |||
| x+=8; | |||
| } | |||
| } | |||
| @@ -65,19 +65,19 @@ static void dscal_kernel_8( BLASLONG n, FLOAT *da , FLOAT *x ) | |||
| static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha , FLOAT *x ) | |||
| { | |||
| BLASLONG i; | |||
| for( i=0; i<n; i+=8 ) | |||
| { | |||
| x[0] = 0.0; | |||
| x[1] = 0.0; | |||
| x[2] = 0.0; | |||
| x[3] = 0.0; | |||
| x[4] = 0.0; | |||
| x[5] = 0.0; | |||
| x[6] = 0.0; | |||
| x[7] = 0.0; | |||
| x+=8; | |||
| } | |||
| BLASLONG i; | |||
| for( i=0; i<n; i+=8 ) | |||
| { | |||
| x[0] = 0.0; | |||
| x[1] = 0.0; | |||
| x[2] = 0.0; | |||
| x[3] = 0.0; | |||
| x[4] = 0.0; | |||
| x[5] = 0.0; | |||
| x[6] = 0.0; | |||
| x[7] = 0.0; | |||
| x+=8; | |||
| } | |||
| } | |||
| @@ -89,51 +89,51 @@ static void dscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_ | |||
| static void dscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_x) | |||
| { | |||
| FLOAT *x1=NULL; | |||
| BLASLONG inc_x3; | |||
| FLOAT *x1=NULL; | |||
| BLASLONG inc_x3; | |||
| inc_x <<= 3; | |||
| inc_x3 = (inc_x << 1) + inc_x; | |||
| inc_x <<= 3; | |||
| inc_x3 = (inc_x << 1) + inc_x; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "movddup (%3), %%xmm0 \n\t" // alpha | |||
| "movddup (%3), %%xmm0 \n\t" // alpha | |||
| "leaq (%1,%4,4), %2 \n\t" | |||
| "leaq (%1,%4,4), %2 \n\t" | |||
| ".p2align 4 \n\t" | |||
| ".p2align 4 \n\t" | |||
| "1: \n\t" | |||
| "movsd (%1) , %%xmm4 \n\t" | |||
| "movhpd (%1,%4,1), %%xmm4 \n\t" | |||
| "movsd (%1,%4,2), %%xmm5 \n\t" | |||
| "movhpd (%1,%5,1), %%xmm5 \n\t" | |||
| "1: \n\t" | |||
| "movsd (%1) , %%xmm4 \n\t" | |||
| "movhpd (%1,%4,1), %%xmm4 \n\t" | |||
| "movsd (%1,%4,2), %%xmm5 \n\t" | |||
| "movhpd (%1,%5,1), %%xmm5 \n\t" | |||
| "movsd (%2) , %%xmm6 \n\t" | |||
| "movhpd (%2,%4,1), %%xmm6 \n\t" | |||
| "movsd (%2,%4,2), %%xmm7 \n\t" | |||
| "movhpd (%2,%5,1), %%xmm7 \n\t" | |||
| "movsd (%2) , %%xmm6 \n\t" | |||
| "movhpd (%2,%4,1), %%xmm6 \n\t" | |||
| "movsd (%2,%4,2), %%xmm7 \n\t" | |||
| "movhpd (%2,%5,1), %%xmm7 \n\t" | |||
| "mulpd %%xmm0, %%xmm4 \n\t" | |||
| "mulpd %%xmm0, %%xmm5 \n\t" | |||
| "mulpd %%xmm0, %%xmm6 \n\t" | |||
| "mulpd %%xmm0, %%xmm7 \n\t" | |||
| "mulpd %%xmm0, %%xmm4 \n\t" | |||
| "mulpd %%xmm0, %%xmm5 \n\t" | |||
| "mulpd %%xmm0, %%xmm6 \n\t" | |||
| "mulpd %%xmm0, %%xmm7 \n\t" | |||
| "movsd %%xmm4 , (%1) \n\t" | |||
| "movhpd %%xmm4 , (%1,%4,1) \n\t" | |||
| "movsd %%xmm5 , (%1,%4,2) \n\t" | |||
| "movhpd %%xmm5 , (%1,%5,1) \n\t" | |||
| "movsd %%xmm4 , (%1) \n\t" | |||
| "movhpd %%xmm4 , (%1,%4,1) \n\t" | |||
| "movsd %%xmm5 , (%1,%4,2) \n\t" | |||
| "movhpd %%xmm5 , (%1,%5,1) \n\t" | |||
| "movsd %%xmm6 , (%2) \n\t" | |||
| "movhpd %%xmm6 , (%2,%4,1) \n\t" | |||
| "movsd %%xmm7 , (%2,%4,2) \n\t" | |||
| "movhpd %%xmm7 , (%2,%5,1) \n\t" | |||
| "movsd %%xmm6 , (%2) \n\t" | |||
| "movhpd %%xmm6 , (%2,%4,1) \n\t" | |||
| "movsd %%xmm7 , (%2,%4,2) \n\t" | |||
| "movhpd %%xmm7 , (%2,%5,1) \n\t" | |||
| "leaq (%1,%4,8), %1 \n\t" | |||
| "leaq (%2,%4,8), %2 \n\t" | |||
| "leaq (%1,%4,8), %1 \n\t" | |||
| "leaq (%2,%4,8), %2 \n\t" | |||
| "subq $8, %0 \n\t" | |||
| "jnz 1b \n\t" | |||
| "subq $8, %0 \n\t" | |||
| "jnz 1b \n\t" | |||
| : | |||
| "+r" (n), // 0 | |||
| @@ -150,106 +150,96 @@ static void dscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_ | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) | |||
| { | |||
| BLASLONG i=0,j=0; | |||
| if ( inc_x != 1 ) | |||
| { | |||
| if ( da == 0.0 ) | |||
| { | |||
| BLASLONG n1 = n & -2; | |||
| while(j < n1) | |||
| { | |||
| if (isinf(x[i])||isnan(x[i])) | |||
| x[i]=NAN; | |||
| else x[i]=0.0; | |||
| if (isinf(x[i+inc_x])||isnan(x[i+inc_x])) | |||
| x[i+inc_x]=NAN; | |||
| else x[i+inc_x]=0.0; | |||
| i += 2*inc_x ; | |||
| j+=2; | |||
| } | |||
| while(j < n) | |||
| { | |||
| if (isinf(x[i])||isnan(x[i])) | |||
| x[i]=NAN; | |||
| else x[i]=0.0; | |||
| i += inc_x ; | |||
| j++; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| BLASLONG n1 = n & -8; | |||
| if ( n1 > 0 ) | |||
| { | |||
| dscal_kernel_inc_8(n1, &da, x, inc_x); | |||
| i = n1 * inc_x; | |||
| j = n1; | |||
| } | |||
| while(j < n) | |||
| { | |||
| x[i] *= da; | |||
| i += inc_x ; | |||
| j++; | |||
| } | |||
| } | |||
| return(0); | |||
| } | |||
| BLASLONG n1 = n & -8; | |||
| if ( n1 > 0 ) | |||
| { | |||
| // if ( da == 0.0 ) | |||
| // dscal_kernel_8_zero(n1 , &da , x); | |||
| // else | |||
| dscal_kernel_8(n1 , &da , x); | |||
| } | |||
| if ( da == 0.0 ) | |||
| { | |||
| for ( i=n1 ; i<n; i++ ) | |||
| { | |||
| if(isinf(x[i])||isnan(x[i])) | |||
| x[i]=NAN; | |||
| else x[i] = 0.0; | |||
| } | |||
| } | |||
| else if (isinf(da)){ | |||
| for ( i=n1 ; i<n; i++) | |||
| if (x[i]==0.) x[i]=NAN; | |||
| else x[i] *=da; | |||
| } | |||
| else | |||
| { | |||
| for ( i=n1 ; i<n; i++ ) | |||
| { | |||
| if(isinf(x[i])) | |||
| x[i]=NAN; | |||
| else x[i] *= da; | |||
| } | |||
| } | |||
| return(0); | |||
| BLASLONG i = 0, j = 0; | |||
| // Resolved issue 4728 when the caller is dscal | |||
| if (dummy2 == 1 && da == 0.0) | |||
| { | |||
| if ( inc_x != 1 ) | |||
| { | |||
| BLASLONG n1 = n & -8; | |||
| if ( n1 > 0 ) | |||
| { | |||
| dscal_kernel_inc_8(n1, &da, x, inc_x); | |||
| i = n1 * inc_x; | |||
| j = n1; | |||
| } | |||
| while(j < n) | |||
| { | |||
| x[i] *= da; | |||
| i += inc_x ; | |||
| j++; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| BLASLONG n1 = n & -8; | |||
| if ( n1 > 0) | |||
| dscal_kernel_8(n1 , &da , x); | |||
| for ( i = n1 ; i < n; i++ ) | |||
| x[i] *= da; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| if ( inc_x != 1 ) | |||
| { | |||
| if( da == 0.0) | |||
| { | |||
| BLASLONG n1 = n & -2; | |||
| while(j < n1) | |||
| { | |||
| x[i] = 0.0; | |||
| x[i+inc_x] = 0.0; | |||
| i += 2 * inc_x ; | |||
| j += 2; | |||
| } | |||
| while(j < n) | |||
| { | |||
| x[i] = 0.0; | |||
| i += inc_x ; | |||
| j++; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| BLASLONG n1 = n & -8; | |||
| if ( n1 > 0 ) | |||
| { | |||
| dscal_kernel_inc_8(n1, &da, x, inc_x); | |||
| i = n1 * inc_x; | |||
| j = n1; | |||
| } | |||
| while(j < n) | |||
| { | |||
| x[i] *= da; | |||
| i += inc_x ; | |||
| j++; | |||
| } | |||
| } | |||
| } | |||
| else | |||
| { | |||
| if ( da == 0.0 ) | |||
| { | |||
| BLASLONG n1 = n & -8; | |||
| if ( n1 > 0) | |||
| dscal_kernel_8_zero(n1, &da, x); | |||
| for ( i = n1 ; i < n; i++ ) | |||
| x[i] = 0.0; | |||
| } | |||
| else | |||
| { | |||
| BLASLONG n1 = n & -8; | |||
| if ( n1 > 0) | |||
| dscal_kernel_8(n1 , &da , x); | |||
| for ( i = n1 ; i < n; i++ ) | |||
| x[i] *= da; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| @@ -60,8 +60,10 @@ | |||
| #ifdef WINDOWS_ABI | |||
| movq 40(%rsp), X | |||
| movq 48(%rsp), INCX | |||
| movq 64(%rsp), %r9 | |||
| movaps %xmm3, %xmm0 | |||
| #else | |||
| movq 24(%rsp), %r9 | |||
| #endif | |||
| SAVEREGISTERS | |||
| @@ -73,6 +75,10 @@ | |||
| lea (, INCX, SIZE), INCX | |||
| comisd %xmm0, %xmm1 | |||
| jne .L100 | |||
| jp .L100 | |||
| cmpq $1, %r9 | |||
| je .L100 | |||
| /* Alpha == ZERO */ | |||
| cmpq $SIZE, INCX | |||
| @@ -60,8 +60,10 @@ | |||
| #ifdef WINDOWS_ABI | |||
| movq 40(%rsp), X | |||
| movq 48(%rsp), INCX | |||
| movq 64(%rsp), %r9 | |||
| movaps %xmm3, %xmm0 | |||
| #else | |||
| movq 24(%rsp), %r9 | |||
| #endif | |||
| SAVEREGISTERS | |||
| @@ -76,6 +78,8 @@ | |||
| shufps $0, %xmm0, %xmm0 | |||
| jne .L100 # Alpha != ZERO | |||
| cmpq $1, %r9 | |||
| je .L100 | |||
| /* Alpha == ZERO */ | |||
| cmpq $SIZE, INCX | |||
| @@ -48,6 +48,7 @@ | |||
| #define X ARG2 | |||
| #define INCX ARG3 | |||
| #endif | |||
| #define FLAG %r9 | |||
| #define XX %r10 | |||
| #define I %rax | |||
| @@ -60,8 +61,10 @@ | |||
| #ifdef WINDOWS_ABI | |||
| movq 40(%rsp), X | |||
| movq 48(%rsp), INCX | |||
| movq 64(%rsp), FLAG | |||
| movaps %xmm3, %xmm0 | |||
| #else | |||
| movq 24(%rsp), FLAG | |||
| #endif | |||
| SAVEREGISTERS | |||
| @@ -75,6 +78,8 @@ | |||
| comisd %xmm0, %xmm1 | |||
| jne .L100 # Alpha != ZERO | |||
| jp .L100 # For Alpha = NaN | |||
| cmpq $1, FLAG | |||
| je .L100 # disable the Alpha=zero path as it does not handle x=inf or nan | |||
| /* Alpha == ZERO */ | |||
| cmpq $SIZE, INCX | |||
| @@ -39,21 +39,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| static void sscal_kernel_16( BLASLONG n, FLOAT *da , FLOAT *x ) | |||
| { | |||
| BLASLONG i; | |||
| FLOAT alpha = *da; | |||
| for( i=0; i<n; i+=8 ) | |||
| { | |||
| x[0] *= alpha; | |||
| x[1] *= alpha; | |||
| x[2] *= alpha; | |||
| x[3] *= alpha; | |||
| x[4] *= alpha; | |||
| x[5] *= alpha; | |||
| x[6] *= alpha; | |||
| x[7] *= alpha; | |||
| x+=8; | |||
| } | |||
| BLASLONG i; | |||
| FLOAT alpha = *da; | |||
| for( i=0; i<n; i+=8 ) | |||
| { | |||
| x[0] *= alpha; | |||
| x[1] *= alpha; | |||
| x[2] *= alpha; | |||
| x[3] *= alpha; | |||
| x[4] *= alpha; | |||
| x[5] *= alpha; | |||
| x[6] *= alpha; | |||
| x[7] *= alpha; | |||
| x+=8; | |||
| } | |||
| } | |||
| @@ -61,19 +61,19 @@ static void sscal_kernel_16( BLASLONG n, FLOAT *da , FLOAT *x ) | |||
| static void sscal_kernel_16_zero( BLASLONG n, FLOAT *alpha , FLOAT *x ) | |||
| { | |||
| BLASLONG i; | |||
| for( i=0; i<n; i+=8 ) | |||
| { | |||
| x[0] = 0.0; | |||
| x[1] = 0.0; | |||
| x[2] = 0.0; | |||
| x[3] = 0.0; | |||
| x[4] = 0.0; | |||
| x[5] = 0.0; | |||
| x[6] = 0.0; | |||
| x[7] = 0.0; | |||
| x+=8; | |||
| } | |||
| BLASLONG i; | |||
| for( i=0; i<n; i+=8 ) | |||
| { | |||
| x[0] = 0.0; | |||
| x[1] = 0.0; | |||
| x[2] = 0.0; | |||
| x[3] = 0.0; | |||
| x[4] = 0.0; | |||
| x[5] = 0.0; | |||
| x[6] = 0.0; | |||
| x[7] = 0.0; | |||
| x+=8; | |||
| } | |||
| } | |||
| @@ -85,126 +85,119 @@ static void sscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_ | |||
| static void sscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_x) | |||
| { | |||
| BLASLONG i; | |||
| BLASLONG inc_x2 = 2 * inc_x; | |||
| BLASLONG inc_x3 = inc_x2 + inc_x; | |||
| FLOAT t0,t1,t2,t3; | |||
| FLOAT da = alpha[0]; | |||
| BLASLONG i; | |||
| BLASLONG inc_x2 = 2 * inc_x; | |||
| BLASLONG inc_x3 = inc_x2 + inc_x; | |||
| FLOAT t0,t1,t2,t3; | |||
| FLOAT da = alpha[0]; | |||
| for ( i=0; i<n; i+=4 ) | |||
| { | |||
| t0 = da * x[0]; | |||
| t1 = da * x[inc_x]; | |||
| t2 = da * x[inc_x2]; | |||
| t3 = da * x[inc_x3]; | |||
| for ( i=0; i<n; i+=4 ) | |||
| { | |||
| t0 = da * x[0]; | |||
| t1 = da * x[inc_x]; | |||
| t2 = da * x[inc_x2]; | |||
| t3 = da * x[inc_x3]; | |||
| x[0] = t0; | |||
| x[inc_x] = t1; | |||
| x[inc_x2] = t2; | |||
| x[inc_x3] = t3; | |||
| x[0] = t0; | |||
| x[inc_x] = t1; | |||
| x[inc_x2] = t2; | |||
| x[inc_x3] = t3; | |||
| x+=4*inc_x; | |||
| x+=4*inc_x; | |||
| } | |||
| } | |||
| } | |||
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) | |||
| { | |||
| BLASLONG i=0,j=0; | |||
| if ( inc_x != 1 ) | |||
| { | |||
| if ( da == 0.0 ) | |||
| { | |||
| BLASLONG n1 = n & -2; | |||
| while(j < n1) | |||
| { | |||
| if (isinf(x[i])||isnan(x[i])) | |||
| x[i]=NAN; | |||
| else x[i]=0.0; | |||
| if (isinf(x[i+inc_x])||isnan(x[i+inc_x])) | |||
| x[i+inc_x]=NAN; | |||
| else x[i+inc_x]=0.0; | |||
| i += 2*inc_x ; | |||
| j+=2; | |||
| } | |||
| while(j < n) | |||
| { | |||
| if (isinf(x[i])||isnan(x[i])) | |||
| x[i]=NAN; | |||
| else x[i]=0.0; | |||
| i += inc_x ; | |||
| j++; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| #if 1 | |||
| BLASLONG n1 = n & -8; | |||
| if ( n1 > 0 ) | |||
| { | |||
| sscal_kernel_inc_8(n1, &da, x, inc_x); | |||
| i = n1 * inc_x; | |||
| j = n1; | |||
| } | |||
| #endif | |||
| while(j < n) | |||
| { | |||
| x[i] *= da; | |||
| i += inc_x ; | |||
| j++; | |||
| } | |||
| } | |||
| return(0); | |||
| } | |||
| BLASLONG n1 = n & -16; | |||
| if ( n1 > 0 ) | |||
| { | |||
| //if ( da == 0.0 ) | |||
| // sscal_kernel_16_zero(n1 , &da , x); | |||
| //else | |||
| sscal_kernel_16(n1 , &da , x); | |||
| } | |||
| if ( da == 0.0 ) | |||
| { | |||
| for ( i=n1 ; i<n; i++ ) | |||
| { | |||
| if (isinf(x[i])||isnan(x[i])) | |||
| x[i]=NAN; | |||
| else x[i]=0.0; | |||
| } | |||
| } | |||
| else if ( isinf(da) ) | |||
| { | |||
| for ( i=n1 ; i<n; i++ ) | |||
| { | |||
| if (x[i] == 0.0) | |||
| x[i]=NAN; | |||
| else x[i] *= da; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| for ( i=n1 ; i<n; i++ ) | |||
| { | |||
| if (isinf(x[i])) | |||
| x[i]=NAN; | |||
| else x[i] *= da; | |||
| } | |||
| } | |||
| return(0); | |||
| BLASLONG i = 0, j = 0; | |||
| // Resolved issue 4728 when the caller is sscal | |||
| if (dummy2 == 1 && da == 0.0) | |||
| { | |||
| if ( inc_x != 1 ) | |||
| { | |||
| BLASLONG n1 = n & -8; | |||
| if ( n1 > 0 ) | |||
| { | |||
| sscal_kernel_inc_8(n1, &da, x, inc_x); | |||
| i = n1 * inc_x; | |||
| j = n1; | |||
| } | |||
| while(j < n) | |||
| { | |||
| x[i] *= da; | |||
| i += inc_x ; | |||
| j++; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| BLASLONG n1 = n & -16; | |||
| if ( n1 > 0) | |||
| sscal_kernel_16(n1 , &da , x); | |||
| for ( i = n1 ; i < n; i++ ) | |||
| x[i] *= da; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| if ( inc_x != 1 ) | |||
| { | |||
| if( da == 0.0) | |||
| { | |||
| BLASLONG n1 = n & -2; | |||
| while(j < n1) | |||
| { | |||
| x[i] = 0.0; | |||
| x[i+inc_x] = 0.0; | |||
| i += 2 * inc_x ; | |||
| j += 2; | |||
| } | |||
| while(j < n) | |||
| { | |||
| x[i] = 0.0; | |||
| i += inc_x ; | |||
| j++; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| BLASLONG n1 = n & -8; | |||
| if ( n1 > 0 ) | |||
| { | |||
| sscal_kernel_inc_8(n1, &da, x, inc_x); | |||
| i = n1 * inc_x; | |||
| j = n1; | |||
| } | |||
| while(j < n) | |||
| { | |||
| x[i] *= da; | |||
| i += inc_x ; | |||
| j++; | |||
| } | |||
| } | |||
| } | |||
| else | |||
| { | |||
| if ( da == 0.0 ) | |||
| { | |||
| BLASLONG n1 = n & -16; | |||
| if ( n1 > 0) | |||
| sscal_kernel_16_zero(n1, &da, x); | |||
| for ( i = n1 ; i < n; i++ ) | |||
| x[i] = 0.0; | |||
| } | |||
| else | |||
| { | |||
| BLASLONG n1 = n & -16; | |||
| if ( n1 > 0) | |||
| sscal_kernel_16(n1 , &da , x); | |||
| for ( i = n1 ; i < n; i++ ) | |||
| x[i] *= da; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| @@ -74,7 +74,7 @@ | |||
| pxor %xmm15, %xmm15 | |||
| comisd %xmm0, %xmm15 | |||
| jne .L30 # Alpha_r != ZERO | |||
| jp .L30 | |||
| comisd %xmm1, %xmm15 | |||
| jne .L30 # Alpha_i != ZERO | |||
| @@ -76,7 +76,7 @@ | |||
| pxor %xmm15, %xmm15 | |||
| comiss %xmm0, %xmm15 | |||
| jne .L100 # Alpha_r != ZERO | |||
| jp .L100 # Alpha_r == NAN | |||
| comiss %xmm1, %xmm15 | |||
| jne .L100 # Alpha_i != ZERO | |||
| @@ -234,12 +234,23 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, | |||
| } else { | |||
| while (j < n1) { | |||
| temp0 = -da_i * x[i + 1]; | |||
| x[i + 1] = da_i * x[i]; | |||
| if (isnan(x[i]) || isinf(x[i])) | |||
| temp0 = NAN; | |||
| else | |||
| temp0 = -da_i * x[i + 1]; | |||
| if (!isinf(x[i + 1])) | |||
| x[i + 1] = da_i * x[i]; | |||
| else | |||
| x[i + 1] = NAN; | |||
| x[i] = temp0; | |||
| temp1 = -da_i * x[i + 1 + inc_x]; | |||
| x[i + 1 + inc_x] = da_i * x[i + inc_x]; | |||
| if (isnan(x[i+inc_x]) || isinf(x[i+inc_x])) | |||
| temp1 = NAN; | |||
| else | |||
| temp1 = -da_i * x[i + 1 + inc_x]; | |||
| if (!isinf(x[i + 1 + inc_x])) | |||
| x[i + 1 + inc_x] = da_i * x[i + inc_x]; | |||
| else | |||
| x[i + 1 + inc_x] = NAN; | |||
| x[i + inc_x] = temp1; | |||
| i += 2 * inc_x; | |||
| j += 2; | |||
| @@ -247,9 +258,14 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, | |||
| } | |||
| while (j < n) { | |||
| temp0 = -da_i * x[i + 1]; | |||
| x[i + 1] = da_i * x[i]; | |||
| if (isnan(x[i]) || isinf(x[i])) | |||
| temp0 = NAN; | |||
| else | |||
| temp0 = -da_i * x[i + 1]; | |||
| if (isinf(x[i + 1])) | |||
| x[i + 1] = NAN; | |||
| else | |||
| x[i + 1] = da_i * x[i]; | |||
| x[i] = temp0; | |||
| i += inc_x; | |||
| j++; | |||
| @@ -332,26 +348,42 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, | |||
| j = n1; | |||
| } | |||
| if (da_r == 0.0) { | |||
| if (da_r == 0.0 || isnan(da_r)) { | |||
| if (da_i == 0.0) { | |||
| float res = 0.0; | |||
| if (isnan(da_r)) res = da_r; | |||
| while (j < n) { | |||
| x[i] = 0.0; | |||
| x[i + 1] = 0.0; | |||
| x[i] = res; | |||
| x[i + 1] = res; | |||
| i += 2; | |||
| j++; | |||
| } | |||
| } else if (isinf(da_r)) { | |||
| while(j < n) | |||
| { | |||
| x[i]= NAN; | |||
| x[i+1] = da_r; | |||
| i += 2 ; | |||
| j++; | |||
| } | |||
| } else { | |||
| while (j < n) { | |||
| temp0 = -da_i * x[i + 1]; | |||
| x[i + 1] = da_i * x[i]; | |||
| x[i] = temp0; | |||
| if (isinf(x[i])) temp0 = NAN; | |||
| if (!isinf(x[i + 1])) | |||
| x[i + 1] = da_i * x[i]; | |||
| else | |||
| x[i + 1] = NAN; | |||
| if (x[i] == x[i]) | |||
| x[i] = temp0; | |||
| i += 2; | |||
| j++; | |||
| @@ -96,20 +96,28 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, | |||
| if (inc_x == 1) { | |||
| if (da == 0.0) { | |||
| BLASLONG n1 = n & -16; | |||
| if (n1 > 0) { | |||
| dscal_kernel_16_zero(n1, x); | |||
| j = n1; | |||
| if (dummy2 == 0) { | |||
| BLASLONG n1 = n & -16; | |||
| if (n1 > 0) { | |||
| dscal_kernel_16_zero(n1, x); | |||
| j = n1; | |||
| } | |||
| while (j < n) { | |||
| x[j] = 0.0; | |||
| j++; | |||
| } | |||
| } else { | |||
| while (j < n) { | |||
| if (isfinite(x[j])) | |||
| x[j] = 0.0; | |||
| else | |||
| x[j] = NAN; | |||
| j++; | |||
| } | |||
| } | |||
| while (j < n) { | |||
| x[j] = 0.0; | |||
| j++; | |||
| } | |||
| } else { | |||
| BLASLONG n1 = n & -16; | |||
| @@ -127,11 +135,9 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, | |||
| } else { | |||
| if (da == 0.0) { | |||
| if (dummy2 == 0) { | |||
| BLASLONG n1 = n & -4; | |||
| while (j < n1) { | |||
| x[i] = 0.0; | |||
| x[i + inc_x] = 0.0; | |||
| x[i + 2 * inc_x] = 0.0; | |||
| @@ -139,11 +145,13 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, | |||
| i += inc_x * 4; | |||
| j += 4; | |||
| } | |||
| } | |||
| while (j < n) { | |||
| x[i] = 0.0; | |||
| if (dummy2==0 || isfinite(x[i])) | |||
| x[i] = 0.0; | |||
| else | |||
| x[i] = NAN; | |||
| i += inc_x; | |||
| j++; | |||
| } | |||
| @@ -95,21 +95,31 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, | |||
| if (inc_x == 1) { | |||
| if (da == 0.0) { | |||
| BLASLONG n1 = n & -32; | |||
| if (n1 > 0) { | |||
| sscal_kernel_32_zero(n1, x); | |||
| j = n1; | |||
| } | |||
| while (j < n) { | |||
| x[j] = 0.0; | |||
| j++; | |||
| if (da == 0.0 || !isfinite(da)) { | |||
| if (dummy2 == 0) { | |||
| BLASLONG n1 = n & -32; | |||
| if (n1 > 0) { | |||
| sscal_kernel_32_zero(n1, x); | |||
| j = n1; | |||
| } | |||
| while (j < n) { | |||
| x[j] = 0.0; | |||
| j++; | |||
| } | |||
| } else { | |||
| float res = 0.0; | |||
| if (!isfinite(da)) res = NAN; | |||
| while (j < n) { | |||
| if (isfinite(x[i])) | |||
| x[j] = res; | |||
| else | |||
| x[j] = NAN; | |||
| j++; | |||
| } | |||
| } | |||
| } else { | |||
| BLASLONG n1 = n & -32; | |||
| @@ -126,26 +136,37 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, | |||
| } else { | |||
| if (da == 0.0) { | |||
| BLASLONG n1 = n & -2; | |||
| while (j < n1) { | |||
| x[i] = 0.0; | |||
| x[i + inc_x] = 0.0; | |||
| i += inc_x * 2; | |||
| j += 2; | |||
| } | |||
| while (j < n) { | |||
| x[i] = 0.0; | |||
| i += inc_x; | |||
| j++; | |||
| } | |||
| if (da == 0.0 || !isfinite(da)) { | |||
| if (dummy2 == 0) { | |||
| BLASLONG n1 = n & -2; | |||
| while (j < n1) { | |||
| x[i] = 0.0; | |||
| x[i + inc_x] = 0.0; | |||
| i += inc_x * 2; | |||
| j += 2; | |||
| } | |||
| while (j < n) { | |||
| x[i] = 0.0; | |||
| i += inc_x; | |||
| j++; | |||
| } | |||
| } else { | |||
| while (j < n) { | |||
| float res = 0.0; | |||
| if (!isfinite(da)) res = NAN; | |||
| if (isfinite(x[i])) | |||
| x[i] = res; | |||
| else | |||
| x[i] = NAN; | |||
| i += inc_x; | |||
| j++; | |||
| } | |||
| } | |||
| } else { | |||
| BLASLONG n1 = n & -2; | |||
| @@ -237,13 +237,19 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, | |||
| temp0 = NAN; | |||
| else | |||
| temp0 = -da_i * x[i + 1]; | |||
| x[i + 1] = da_i * x[i]; | |||
| if (!isinf(x[i + 1])) | |||
| x[i + 1] = da_i * x[i]; | |||
| else | |||
| x[i + 1] = NAN; | |||
| x[i] = temp0; | |||
| if (isnan(x[i + inc_x]) || isinf(x[i + inc_x])) | |||
| temp1 = NAN; | |||
| else | |||
| temp1 = -da_i * x[i + 1 + inc_x]; | |||
| x[i + 1 + inc_x] = da_i * x[i + inc_x]; | |||
| if (!isinf(x[i + 1 + inc_x])) | |||
| x[i + 1 + inc_x] = da_i * x[i + inc_x]; | |||
| else | |||
| x[i + 1 + inc_x] = NAN; | |||
| x[i + inc_x] = temp1; | |||
| i += 2 * inc_x; | |||
| j += 2; | |||
| @@ -256,7 +262,10 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, | |||
| temp0 = NAN; | |||
| else | |||
| temp0 = -da_i * x[i + 1]; | |||
| x[i + 1] = da_i * x[i]; | |||
| if (!isinf(x[i +1])) | |||
| x[i + 1] = da_i * x[i]; | |||
| else | |||
| x[i + 1] = NAN; | |||
| x[i] = temp0; | |||
| i += inc_x; | |||
| j++; | |||
| @@ -330,7 +339,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, | |||
| zscal_kernel_8_zero(n1, x); | |||
| else | |||
| zscal_kernel_8(n1, da_r, da_i, x); | |||
| else if (da_i == 0) | |||
| else if (da_i == 0 && da_r == da_r) | |||
| zscal_kernel_8_zero_i(n1, alpha, x); | |||
| else | |||
| zscal_kernel_8(n1, da_r, da_i, x); | |||
| @@ -339,29 +348,41 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, | |||
| j = n1; | |||
| } | |||
| if (da_r == 0.0) { | |||
| if (da_r == 0.0 || isnan(da_r)) { | |||
| if (da_i == 0.0) { | |||
| double res= 0.0; | |||
| if (isnan(da_r)) res = da_r; | |||
| while (j < n) { | |||
| x[i] = 0.0; | |||
| x[i + 1] = 0.0; | |||
| x[i] = res; | |||
| x[i + 1] = res; | |||
| i += 2; | |||
| j++; | |||
| } | |||
| } else if (isinf(da_r)) { | |||
| while (j < n) { | |||
| x[i] = NAN; | |||
| x[i + 1] = da_r; | |||
| i += 2; | |||
| j++; | |||
| } | |||
| } else { | |||
| while (j < n) { | |||
| if (isnan(x[i]) || isinf(x[i])) | |||
| if (isinf(x[i])) | |||
| temp0 = NAN; | |||
| else | |||
| temp0 = -da_i * x[i + 1]; | |||
| x[i + 1] = da_i * x[i]; | |||
| x[i] = temp0; | |||
| if (!isinf(x[i + 1])) | |||
| x[i + 1] = da_i * x[i]; | |||
| else | |||
| x[i + 1] = NAN; | |||
| if (x[i]==x[i]) | |||
| x[i] = temp0; | |||
| i += 2; | |||
| j++; | |||
| @@ -256,7 +256,7 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; | |||
| #define myceiling_(w) {ceil(w)} | |||
| #define myhuge_(w) {HUGE_VAL} | |||
| //#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} | |||
| #define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) | |||
| #define mymaxloc_(w,s,e,n) smaxloc_(w,*(s),*(e),n) | |||
| /* procedure parameter types for -A and -C++ */ | |||
| @@ -256,7 +256,7 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; | |||
| #define myceiling_(w) {ceil(w)} | |||
| #define myhuge_(w) {HUGE_VAL} | |||
| //#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} | |||
| #define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) | |||
| #define mymaxloc_(w,s,e,n) smaxloc_(w,*(s),*(e),n) | |||
| /* procedure parameter types for -A and -C++ */ | |||
| @@ -256,7 +256,7 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; | |||
| #define myceiling_(w) {ceil(w)} | |||
| #define myhuge_(w) {HUGE_VAL} | |||
| //#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} | |||
| #define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) | |||
| #define mymaxloc_(w,s,e,n) smaxloc_(w,*(s),*(e),n) | |||
| /* procedure parameter types for -A and -C++ */ | |||
| @@ -256,7 +256,7 @@ static char junk[] = "\n@(#)LIBF77 VERSION 19990503\n"; | |||
| #define myceiling_(w) {ceil(w)} | |||
| #define myhuge_(w) {HUGE_VAL} | |||
| //#define mymaxloc_(w,s,e,n) {if (sizeof(*(w)) == sizeof(double)) dmaxloc_((w),*(s),*(e),n); else dmaxloc_((w),*(s),*(e),n);} | |||
| #define mymaxloc_(w,s,e,n) dmaxloc_(w,*(s),*(e),n) | |||
| #define mymaxloc_(w,s,e,n) smaxloc_(w,*(s),*(e),n) | |||
| /* procedure parameter types for -A and -C++ */ | |||
| @@ -1,6 +1,11 @@ | |||
| TOPDIR = ../../.. | |||
| include ../../../Makefile.system | |||
| ifeq ($(DYNAMIC_ARCH), 1) | |||
| LASWP = ../generic/laswp_k_4.c | |||
| ZLASWP = ../generic/zlaswp_k_4.c | |||
| endif | |||
| ifndef LASWP | |||
| LASWP = ../generic/laswp_k.c | |||
| endif | |||
| @@ -2,7 +2,7 @@ TOPDIR = .. | |||
| include ../Makefile.system | |||
| ifeq ($(F_COMPILER),GFORTRAN) | |||
| ifneq (, $(filter $(CORE),LOONGSON3R3 LOONGSON3R4)) | |||
| override FFLAGS = $(filter_out(-O2 -O3,$(FFLAGS)) -O0 | |||
| override FFLAGS = $(filter_out(-O2 -O3,$(FFLAGS))) -O0 | |||
| endif | |||
| override FFLAGS += -fno-tree-vectorize | |||
| endif | |||
| @@ -18,6 +18,7 @@ else () | |||
| test_zscal.c | |||
| test_amin.c | |||
| test_axpby.c | |||
| test_gemv.c | |||
| ) | |||
| endif () | |||
| @@ -14,7 +14,7 @@ UTESTEXTBIN=openblas_utest_ext | |||
| include $(TOPDIR)/Makefile.system | |||
| OBJS=utest_main.o test_min.o test_amax.o test_ismin.o test_rotmg.o test_axpy.o test_dotu.o test_dsdot.o test_swap.o test_rot.o test_dnrm2.o test_zscal.o \ | |||
| test_amin.o test_axpby.o | |||
| test_amin.o test_axpby.o test_gemv.o | |||
| #test_rot.o test_swap.o test_axpy.o test_dotu.o test_dsdot.o test_fork.o | |||
| OBJS_EXT=utest_main.o $(DIR_EXT)/xerbla.o $(DIR_EXT)/common.o | |||
| OBJS_EXT+=$(DIR_EXT)/test_isamin.o $(DIR_EXT)/test_idamin.o $(DIR_EXT)/test_icamin.o $(DIR_EXT)/test_izamin.o | |||
| @@ -0,0 +1,130 @@ | |||
| #include "openblas_utest.h" | |||
| #include <cblas.h> | |||
| #ifndef NAN | |||
| #define NAN 0.0/0.0 | |||
| #endif | |||
| #ifndef INFINITY | |||
| #define INFINITY 1.0/0.0 | |||
| #endif | |||
| #ifdef BUILD_SINGLE | |||
| CTEST(sgemv, 0_nan_inf) | |||
| { | |||
| int i; | |||
| blasint N = 17; | |||
| blasint incX = 1; | |||
| blasint incY = 1; | |||
| float alpha = 0.0; | |||
| float beta = 0.0; | |||
| char trans = 'N'; | |||
| float A[17 * 17]; | |||
| float X[17]; | |||
| float Y[17]; | |||
| memset(A, 0, sizeof(A)); | |||
| memset(X, 0, sizeof(X)); | |||
| for (i = 0; i < (N - 1); i += 2) | |||
| { | |||
| Y[i] = NAN; | |||
| Y[i + 1] = INFINITY; | |||
| } | |||
| Y[N - 1] = NAN; | |||
| BLASFUNC(sgemv)(&trans, &N, &N, &alpha, A, &N, X, &incX, &beta, Y, &incY); | |||
| for (i = 0; i < N; i ++) | |||
| ASSERT_TRUE(Y[i] == 0.0); | |||
| } | |||
| CTEST(sgemv, 0_nan_inf_incy_2) | |||
| { | |||
| int i; | |||
| blasint N = 17; | |||
| blasint Ny = 33; | |||
| blasint incX = 1; | |||
| blasint incY = 2; | |||
| float alpha = 0.0; | |||
| float beta = 0.0; | |||
| char trans = 'N'; | |||
| float A[17 * 17]; | |||
| float X[17]; | |||
| float Y[33]; | |||
| float *ay = Y; | |||
| memset(A, 0, sizeof(A)); | |||
| memset(X, 0, sizeof(X)); | |||
| memset(Y, 0, sizeof(Y)); | |||
| for (i = 0; i < (N - 1); i += 2) | |||
| { | |||
| ay[0] = NAN; | |||
| ay += 2; | |||
| ay[0] = INFINITY; | |||
| ay += 2; | |||
| } | |||
| Y[Ny - 1] = NAN; | |||
| BLASFUNC(sgemv)(&trans, &N, &N, &alpha, A, &N, X, &incX, &beta, Y, &incY); | |||
| for (i = 0; i < Ny; i ++) | |||
| ASSERT_TRUE(Y[i] == 0.0); | |||
| } | |||
| #endif | |||
| #ifdef BUILD_DOUBLE | |||
| CTEST(dgemv, 0_nan_inf) | |||
| { | |||
| int i; | |||
| blasint N = 17; | |||
| blasint incX = 1; | |||
| blasint incY = 1; | |||
| double alpha = 0.0; | |||
| double beta = 0.0; | |||
| char trans = 'N'; | |||
| double A[17 * 17]; | |||
| double X[17]; | |||
| double Y[17]; | |||
| memset(A, 0, sizeof(A)); | |||
| memset(X, 0, sizeof(X)); | |||
| for (i = 0; i < (N - 1); i += 2) | |||
| { | |||
| Y[i] = NAN; | |||
| Y[i + 1] = INFINITY; | |||
| } | |||
| Y[N - 1] = NAN; | |||
| BLASFUNC(dgemv)(&trans, &N, &N, &alpha, A, &N, X, &incX, &beta, Y, &incY); | |||
| for (i = 0; i < N; i ++) | |||
| ASSERT_TRUE(Y[i] == 0.0); | |||
| } | |||
| CTEST(dgemv, 0_nan_inf_incy_2) | |||
| { | |||
| int i; | |||
| blasint N = 17; | |||
| blasint Ny = 33; | |||
| blasint incX = 1; | |||
| blasint incY = 2; | |||
| double alpha = 0.0; | |||
| double beta = 0.0; | |||
| char trans = 'N'; | |||
| double A[17 * 17]; | |||
| double X[17]; | |||
| double Y[33]; | |||
| double *ay = Y; | |||
| memset(A, 0, sizeof(A)); | |||
| memset(X, 0, sizeof(X)); | |||
| memset(Y, 0, sizeof(Y)); | |||
| for (i = 0; i < (N - 1); i += 2) | |||
| { | |||
| ay[0] = NAN; | |||
| ay += 2; | |||
| ay[0] = INFINITY; | |||
| ay += 2; | |||
| } | |||
| Y[Ny - 1] = NAN; | |||
| BLASFUNC(dgemv)(&trans, &N, &N, &alpha, A, &N, X, &incX, &beta, Y, &incY); | |||
| for (i = 0; i < Ny; i ++) | |||
| ASSERT_TRUE(Y[i] == 0.0); | |||
| } | |||
| #endif | |||
| @@ -32,7 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| **********************************************************************************/ | |||
| #include "openblas_utest.h" | |||
| #pragma GCC optimize("no-gcse") | |||
| /* | |||
| void BLASFUNC(cpotrf)(char*, BLASINT*, complex float*, BLASINT*, BLASINT*); | |||
| void BLASFUNC(zpotrs_(char*, BLASINT*, BLASINT*, complex double*, | |||