Update from develop for release 0.3.20 (tags/v0.3.20)
| @@ -251,12 +251,14 @@ if (APPLE AND DYNAMIC_ARCH AND BUILD_SHARED_LIBS) | |||
| set (CMAKE_Fortran_USE_RESPONSE_FILE_FOR_OBJECTS 1) | |||
| set (CMAKE_Fortran_CREATE_SHARED_LIBRARY | |||
| "sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ar -ru libopenblas.a && exit 0' " | |||
| "sh -c 'ar -ru libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' " | |||
| "sh -c 'echo \"\" | ${CMAKE_Fortran_COMPILER} -o dummy.o -c -x f95-cpp-input - '" | |||
| "sh -c '${CMAKE_Fortran_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load dummy.o -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'" | |||
| "sh -c 'ls -l ${CMAKE_BINARY_DIR}/lib'") | |||
| else () | |||
| set (CMAKE_C_CREATE_SHARED_LIBRARY | |||
| "sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ar -ru libopenblas.a && exit 0' " | |||
| "sh -c 'ar -ru libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' " | |||
| "sh -c '${CMAKE_C_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'") | |||
| endif () | |||
| endif() | |||
| @@ -201,3 +201,9 @@ In chronological order: | |||
| * Bine Brank <https://github.com/binebrank> | |||
| * [2021-10-27] Add vector-length-agnostic DGEMM kernels for Arm SVE | |||
| * [2021-11-20] Vector-length-agnostic Arm SVE copy routines for DGEMM, DTRMM, DSYMM | |||
| * [2021-11-12] SVE kernels for SGEMM, STRMM and corresponding SVE copy functions | |||
| * [2022-01-06] SVE kernels for CGEMM, ZGEMM, CTRMM, ZTRMM and corresponding SVE copy functions | |||
| * [2022-01-18] SVE kernels and copy functions for TRSM | |||
| * Ilya Kurdyukov <https://github.com/ilyakurdyukov> | |||
| * [2021-02-21] Add basic support for the Elbrus E2000 architecture | |||
| @@ -1,4 +1,39 @@ | |||
| OpenBLAS ChangeLog | |||
| ==================================================================== | |||
| Version 0.3.20 | |||
| 20-Feb-2022 | |||
| general: | |||
| - some code cleanup, with added casts etc. | |||
| - fixed obtaining the cpu count with OpenMP and OMP_PROC_BIND unset | |||
| - fixed pivot index calculation by ?LASWP for negative increments other than one | |||
| - fixed input argument check in LAPACK ?GEQRT2 | |||
| - improved the check for a Fortran compiler in CMAKE builds | |||
| - disabled building OpenBLAS' optimized versions of LAPACK complex SPMV, SPR, SYMV, SYR with NO_LAPACK=1 | |||
| - fixed building of LAPACK on certain distributed filesystems with parallel gmake | |||
| - fixed building the shared library on MacOS with classic flang | |||
| x86_64: | |||
| - fixed cross-compilation with CMAKE for CORE2 target | |||
| - fixed miscompilation of AVX512 code in DYNAMIC_ARCH builds | |||
| - added support for the "incidental" AVX512 hardware in Alder Lake when enabled in BIOS | |||
| E2K: | |||
| - added new architecture (Russian Elbrus E2000 family) | |||
| SPARC: | |||
| - fixed IMIN/IMAX | |||
| ARMV8: | |||
| - added SVE-enabled CGEMM and ZGEMM kernels for ARMV8SVE and A64FX | |||
| - added support for Neoverse N2 and V1 cpus | |||
| MIPS,MIPS64: | |||
| - fixed autodetection of MSA capability | |||
| LOONGARCH64: | |||
| - added an optimized DGEMM kernel | |||
| ==================================================================== | |||
| Version 0.3.19 | |||
| 19-Dec-2021 | |||
| @@ -78,6 +78,66 @@ endif | |||
| endif | |||
| endif | |||
| # Fall back to a72 tunings on older compilers because -mtune=neoverse-v1 | |||
| # is only available in GCC>=9.4 | |||
| ifeq ($(CORE), NEOVERSEV1) | |||
| ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG))) | |||
| ifeq ($(GCCVERSIONGTEQ9), 1) | |||
| ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ10))) | |||
| CCOMMON_OPT += -march=armv8.4-a -mtune=neoverse-v1 | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=armv8.4-a -mtune=neoverse-v1 | |||
| endif | |||
| else | |||
| CCOMMON_OPT += -march=armv8.4-a -mtune=native | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=armv8.4-a -mtune=native | |||
| endif | |||
| endif | |||
| else | |||
| CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 | |||
| endif | |||
| endif | |||
| else | |||
| CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 | |||
| endif | |||
| endif | |||
| endif | |||
| # Fall back to a72 tunings on older compilers because -mtune=neoverse-n2 | |||
| # is only available in GCC>=9.4 | |||
| ifeq ($(CORE), NEOVERSEN2) | |||
| ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG))) | |||
| ifeq ($(GCCVERSIONGTEQ9), 1) | |||
| ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ10))) | |||
| CCOMMON_OPT += -march=armv8.5-a -mtune=neoverse-n2 | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=armv8.5-a -mtune=neoverse-n2 | |||
| endif | |||
| else | |||
| CCOMMON_OPT += -march=armv8.5-a -mtune=native | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=armv8.5-a -mtune=native | |||
| endif | |||
| endif | |||
| else | |||
| CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 | |||
| endif | |||
| endif | |||
| else | |||
| CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 | |||
| endif | |||
| endif | |||
| endif | |||
| # Use a53 tunings because a55 is only available in GCC>=8.1 | |||
| ifeq ($(CORE), CORTEXA55) | |||
| ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG))) | |||
| @@ -0,0 +1 @@ | |||
| COPT = -Wall -O2 # -DGEMMTEST | |||
| @@ -3,6 +3,10 @@ | |||
| export BINARY | |||
| export USE_OPENMP | |||
| ifdef DYNAMIC_ARCH | |||
| override HOST_CFLAGS += -DDYNAMIC_ARCH | |||
| endif | |||
| ifdef TARGET_CORE | |||
| TARGET_MAKE = Makefile_kernel.conf | |||
| TARGET_CONF = config_kernel.h | |||
| @@ -3,7 +3,7 @@ | |||
| # | |||
| # This library's version | |||
| VERSION = 0.3.19 | |||
| VERSION = 0.3.19.dev | |||
| # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a | |||
| # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library | |||
| @@ -277,7 +277,7 @@ HAVE_GAS := $(shell $(AS) -v < /dev/null 2>&1 | grep GNU 2>&1 >/dev/null ; echo | |||
| GETARCH_FLAGS += -DHAVE_GAS=$(HAVE_GAS) | |||
| # Generating Makefile.conf and config.h | |||
| DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" HOST_CFLAGS="$(GETARCH_FLAGS)" CFLAGS="$(CFLAGS)" BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) TARGET_CORE=$(TARGET_CORE) ONLY_CBLAS=$(ONLY_CBLAS) TARGET=$(TARGET) all) | |||
| DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" HOST_CFLAGS="$(GETARCH_FLAGS)" CFLAGS="$(CFLAGS)" BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) DYNAMIC_ARCH=$(DYNAMIC_ARCH) TARGET_CORE=$(TARGET_CORE) ONLY_CBLAS=$(ONLY_CBLAS) TARGET=$(TARGET) all) | |||
| ifndef TARGET_CORE | |||
| include $(TOPDIR)/Makefile.conf | |||
| @@ -374,6 +374,7 @@ else | |||
| endif | |||
| GCCMINORVERSIONGTEQ1 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 1) | |||
| GCCMINORVERSIONGTEQ2 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 2) | |||
| GCCMINORVERSIONGTEQ4 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 4) | |||
| GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 7) | |||
| endif | |||
| @@ -654,6 +655,8 @@ DYNAMIC_CORE += CORTEXA57 | |||
| DYNAMIC_CORE += CORTEXA72 | |||
| DYNAMIC_CORE += CORTEXA73 | |||
| DYNAMIC_CORE += NEOVERSEN1 | |||
| DYNAMIC_CORE += NEOVERSEV1 | |||
| DYNAMIC_CORE += NEOVERSEN2 | |||
| DYNAMIC_CORE += CORTEXA55 | |||
| DYNAMIC_CORE += FALKOR | |||
| DYNAMIC_CORE += THUNDERX | |||
| @@ -93,6 +93,8 @@ CORTEXA57 | |||
| CORTEXA72 | |||
| CORTEXA73 | |||
| NEOVERSEN1 | |||
| NEOVERSEV1 | |||
| NEOVERSEN2 | |||
| CORTEXA55 | |||
| EMAG8180 | |||
| FALKOR | |||
| @@ -113,3 +115,7 @@ C910V | |||
| 11.LOONGARCH64: | |||
| LOONGSON3R5 | |||
| 12. Elbrus E2000: | |||
| E2K | |||
| @@ -224,7 +224,7 @@ jobs: | |||
| - job: OSX_IOS_ARMV8 | |||
| pool: | |||
| vmImage: 'macOS-10.15' | |||
| vmImage: 'macOS-11' | |||
| variables: | |||
| CC: /Applications/Xcode_12.4.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang | |||
| CFLAGS: -O2 -Wno-macro-redefined -isysroot /Applications/Xcode_12.4.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS14.4.sdk -arch arm64 -miphoneos-version-min=10.0 | |||
| @@ -84,6 +84,7 @@ $os = Haiku if ($data =~ /OS_HAIKU/); | |||
| $architecture = x86 if ($data =~ /ARCH_X86/); | |||
| $architecture = x86_64 if ($data =~ /ARCH_X86_64/); | |||
| $architecture = e2k if ($data =~ /ARCH_E2K/); | |||
| $architecture = power if ($data =~ /ARCH_POWER/); | |||
| $architecture = mips if ($data =~ /ARCH_MIPS/); | |||
| $architecture = mips64 if ($data =~ /ARCH_MIPS64/); | |||
| @@ -124,6 +125,11 @@ if ($architecture eq "zarch") { | |||
| $binary = 64; | |||
| } | |||
| if ($architecture eq "e2k") { | |||
| $defined = 1; | |||
| $binary = 64; | |||
| } | |||
| if ($architecture eq "alpha") { | |||
| $defined = 1; | |||
| $binary = 64; | |||
| @@ -223,6 +229,7 @@ if (($architecture eq "mips") || ($architecture eq "mips64")) { | |||
| $architecture = x86 if ($data =~ /ARCH_X86/); | |||
| $architecture = x86_64 if ($data =~ /ARCH_X86_64/); | |||
| $architecture = e2k if ($data =~ /ARCH_E2K/); | |||
| $architecture = power if ($data =~ /ARCH_POWER/); | |||
| $architecture = mips if ($data =~ /ARCH_MIPS/); | |||
| $architecture = mips64 if ($data =~ /ARCH_MIPS64/); | |||
| @@ -44,7 +44,7 @@ endif () | |||
| if (DYNAMIC_ARCH) | |||
| if (ARM64) | |||
| set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA55 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110) | |||
| set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA55 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 NEOVERSEV1 NEOVERSEN2 THUNDERX3T110) | |||
| if (DYNAMIC_LIST) | |||
| set(DYNAMIC_CORE ARMV8 ${DYNAMIC_LIST}) | |||
| endif () | |||
| @@ -20,19 +20,16 @@ | |||
| # NEEDBUNDERSCORE | |||
| # NEED2UNDERSCORES | |||
| if (NOT NO_LAPACK) | |||
| include(CheckLanguage) | |||
| check_language(Fortran) | |||
| if(CMAKE_Fortran_COMPILER) | |||
| enable_language(Fortran) | |||
| else() | |||
| message(STATUS "No Fortran compiler found, can build only BLAS but not LAPACK") | |||
| include(CheckLanguage) | |||
| check_language(Fortran) | |||
| if(CMAKE_Fortran_COMPILER) | |||
| enable_language(Fortran) | |||
| else() | |||
| if (NOT NO_LAPACK) | |||
| message(STATUS "No Fortran compiler found, can build only BLAS but not LAPACK") | |||
| endif() | |||
| set (NOFORTRAN 1) | |||
| set (NO_LAPACK 1) | |||
| endif() | |||
| else() | |||
| include(CMakeForceCompiler) | |||
| CMAKE_FORCE_Fortran_COMPILER(gfortran GNU) | |||
| endif() | |||
| if (NOT ONLY_CBLAS) | |||
| @@ -127,6 +127,10 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS | |||
| "#define DLOCAL_BUFFER_SIZE\t16384\n" | |||
| "#define CLOCAL_BUFFER_SIZE\t16384\n" | |||
| "#define ZLOCAL_BUFFER_SIZE\t16384\n") | |||
| set(HAVE_SSE 1) | |||
| set(HAVE_SSE2 1) | |||
| set(HAVE_SSE3 1) | |||
| set(HAVE_SSSE3 1) | |||
| set(SGEMM_UNROLL_M 8) | |||
| set(SGEMM_UNROLL_N 4) | |||
| set(DGEMM_UNROLL_M 4) | |||
| @@ -237,6 +241,61 @@ endif () | |||
| set(ZGEMM_UNROLL_N 4) | |||
| set(SYMV_P 16) | |||
| elseif ("${TCORE}" STREQUAL "NEOVERSEN1") | |||
| file(APPEND ${TARGET_CONF_TEMP} | |||
| "#define L1_CODE_SIZE\t65536\n" | |||
| "#define L1_CODE_LINESIZE\t64\n" | |||
| "#define L1_CODE_ASSOCIATIVE\t4\n" | |||
| "#define L1_DATA_SIZE\t65536\n" | |||
| "#define L1_DATA_LINESIZE\t64\n" | |||
| "#define L1_DATA_ASSOCIATIVE\t4\n" | |||
| "#define L2_SIZE\t1048576\n\n" | |||
| "#define L2_LINESIZE\t64\n" | |||
| "#define L2_ASSOCIATIVE\t8\n" | |||
| "#define DTB_DEFAULT_ENTRIES\t48\n" | |||
| "#define DTB_SIZE\t4096\n" | |||
| "#define HAVE_VFPV4\n" | |||
| "#define HAVE_VFPV3\n" | |||
| "#define HAVE_VFP\n" | |||
| "#define HAVE_NEON\n" | |||
| "#define ARMV8\n") | |||
| set(SGEMM_UNROLL_M 16) | |||
| set(SGEMM_UNROLL_N 4) | |||
| set(DGEMM_UNROLL_M 8) | |||
| set(DGEMM_UNROLL_N 4) | |||
| set(CGEMM_UNROLL_M 8) | |||
| set(CGEMM_UNROLL_N 4) | |||
| set(ZGEMM_UNROLL_M 4) | |||
| set(ZGEMM_UNROLL_N 4) | |||
| set(SYMV_P 16) | |||
| elseif ("${TCORE}" STREQUAL "NEOVERSEV1") | |||
| file(APPEND ${TARGET_CONF_TEMP} | |||
| "#define L1_CODE_SIZE\t65536\n" | |||
| "#define L1_CODE_LINESIZE\t64\n" | |||
| "#define L1_CODE_ASSOCIATIVE\t4\n" | |||
| "#define L1_DATA_SIZE\t65536\n" | |||
| "#define L1_DATA_LINESIZE\t64\n" | |||
| "#define L1_DATA_ASSOCIATIVE\t4\n" | |||
| "#define L2_SIZE\t1048576\n\n" | |||
| "#define L2_LINESIZE\t64\n" | |||
| "#define L2_ASSOCIATIVE\t8\n" | |||
| "#define DTB_DEFAULT_ENTRIES\t48\n" | |||
| "#define DTB_SIZE\t4096\n" | |||
| "#define HAVE_VFPV4\n" | |||
| "#define HAVE_VFPV3\n" | |||
| "#define HAVE_VFP\n" | |||
| "#define HAVE_NEON\n" | |||
| "#define HAVE_SVE\n" | |||
| "#define ARMV8\n") | |||
| set(SGEMM_UNROLL_M 16) | |||
| set(SGEMM_UNROLL_N 4) | |||
| set(DGEMM_UNROLL_M 8) | |||
| set(DGEMM_UNROLL_N 4) | |||
| set(CGEMM_UNROLL_M 8) | |||
| set(CGEMM_UNROLL_N 4) | |||
| set(ZGEMM_UNROLL_M 4) | |||
| set(ZGEMM_UNROLL_N 4) | |||
| set(SYMV_P 16) | |||
| elseif ("${TCORE}" STREQUAL "NEOVERSEN2") | |||
| file(APPEND ${TARGET_CONF_TEMP} | |||
| "#define L1_CODE_SIZE\t65536\n" | |||
| "#define L1_CODE_LINESIZE\t64\n" | |||
| @@ -246,13 +305,14 @@ endif () | |||
| "#define L1_DATA_ASSOCIATIVE\t2\n" | |||
| "#define L2_SIZE\t1048576\n\n" | |||
| "#define L2_LINESIZE\t64\n" | |||
| "#define L2_ASSOCIATIVE\t16\n" | |||
| "#define DTB_DEFAULT_ENTRIES\t64\n" | |||
| "#define L2_ASSOCIATIVE\t8\n" | |||
| "#define DTB_DEFAULT_ENTRIES\t48\n" | |||
| "#define DTB_SIZE\t4096\n" | |||
| "#define HAVE_VFPV4\n" | |||
| "#define HAVE_VFPV3\n" | |||
| "#define HAVE_VFP\n" | |||
| "#define HAVE_NEON\n" | |||
| "#define HAVE_SVE\n" | |||
| "#define ARMV8\n") | |||
| set(SGEMM_UNROLL_M 16) | |||
| set(SGEMM_UNROLL_N 4) | |||
| @@ -125,7 +125,7 @@ macro(ParseMakefileVars MAKEFILE_IN) | |||
| if (NOT "${line_match}" STREQUAL "") | |||
| #message(STATUS "${CMAKE_MATCH_1} first: ${CMAKE_MATCH_2}") | |||
| set (ElseSeen 0) | |||
| if (DEFINED ${CMAKE_MATCH_2}) | |||
| if (${CMAKE_MATCH_2}) | |||
| if (${CMAKE_MATCH_1} STREQUAL "ifdef") | |||
| #message (STATUS "condition is true") | |||
| set (IfElse 1) | |||
| @@ -474,6 +474,10 @@ please https://github.com/xianyi/OpenBLAS/issues/246 | |||
| #include "common_loongarch64.h" | |||
| #endif | |||
| #ifdef ARCH_E2K | |||
| #include "common_e2k.h" | |||
| #endif | |||
| #ifndef ASSEMBLER | |||
| #ifdef OS_WINDOWSSTORE | |||
| typedef char env_var_t[MAX_PATH]; | |||
| @@ -0,0 +1,64 @@ | |||
| /***************************************************************************** | |||
| Copyright (c) 2011-2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written | |||
| permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| **********************************************************************************/ | |||
| #ifndef COMMON_E2K | |||
| #define COMMON_E2K | |||
| #ifdef ASSEMBLER | |||
| #error | |||
| #endif | |||
| #define MB do { __asm__ __volatile__("": : :"memory"); } while (0) | |||
| #define WMB do { __asm__ __volatile__("": : :"memory"); } while (0) | |||
| #define RMB | |||
| #define INLINE __attribute__((__always_inline__)) inline | |||
| static inline int blas_quickdivide(blasint x, blasint y) { | |||
| return x / y; | |||
| } | |||
| #ifndef PAGESIZE | |||
| #define PAGESIZE ( 4 << 10) | |||
| #endif | |||
| #define HUGE_PAGESIZE ( 2 << 20) | |||
| #ifndef BUFFERSIZE | |||
| #define BUFFER_SIZE (32 << 20) | |||
| #else | |||
| #define BUFFER_SIZE (32 << BUFFERSIZE) | |||
| #endif | |||
| #define SEEK_ADDRESS | |||
| #endif | |||
| @@ -2611,7 +2611,7 @@ | |||
| #ifndef ASSEMBLER | |||
| #if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64)\ | |||
| || defined(ARCH_LOONGARCH64) | |||
| || defined(ARCH_LOONGARCH64) || defined(ARCH_E2K) | |||
| extern BLASLONG gemm_offset_a; | |||
| extern BLASLONG gemm_offset_b; | |||
| extern BLASLONG sbgemm_p; | |||
| @@ -43,6 +43,8 @@ size_t length64=sizeof(value64); | |||
| #define CPU_CORTEXA72 4 | |||
| #define CPU_CORTEXA73 5 | |||
| #define CPU_NEOVERSEN1 11 | |||
| #define CPU_NEOVERSEV1 16 | |||
| #define CPU_NEOVERSEN2 17 | |||
| // Qualcomm | |||
| #define CPU_FALKOR 6 | |||
| // Cavium | |||
| @@ -71,6 +73,8 @@ static char *cpuname[] = { | |||
| "TSV110", | |||
| "EMAG8180", | |||
| "NEOVERSEN1", | |||
| "NEOVERSEV1" | |||
| "NEOVERSEN2" | |||
| "THUNDERX3T110", | |||
| "VORTEX", | |||
| "CORTEXA55", | |||
| @@ -90,6 +94,8 @@ static char *cpuname_lower[] = { | |||
| "tsv110", | |||
| "emag8180", | |||
| "neoversen1", | |||
| "neoversev1", | |||
| "neoversen2", | |||
| "thunderx3t110", | |||
| "vortex", | |||
| "cortexa55", | |||
| @@ -170,6 +176,10 @@ int detect(void) | |||
| return CPU_CORTEXA73; | |||
| else if (strstr(cpu_part, "0xd0c")) | |||
| return CPU_NEOVERSEN1; | |||
| else if (strstr(cpu_part, "0xd40")) | |||
| return CPU_NEOVERSEV1; | |||
| else if (strstr(cpu_part, "0xd49")) | |||
| return CPU_NEOVERSEN2; | |||
| else if (strstr(cpu_part, "0xd05")) | |||
| return CPU_CORTEXA55; | |||
| } | |||
| @@ -338,11 +348,41 @@ void get_cpuconfig(void) | |||
| printf("#define L1_DATA_ASSOCIATIVE 4\n"); | |||
| printf("#define L2_SIZE 1048576\n"); | |||
| printf("#define L2_LINESIZE 64\n"); | |||
| printf("#define L2_ASSOCIATIVE 16\n"); | |||
| printf("#define DTB_DEFAULT_ENTRIES 64\n"); | |||
| printf("#define L2_ASSOCIATIVE 8\n"); | |||
| printf("#define DTB_DEFAULT_ENTRIES 48\n"); | |||
| printf("#define DTB_SIZE 4096\n"); | |||
| break; | |||
| case CPU_NEOVERSEV1: | |||
| printf("#define %s\n", cpuname[d]); | |||
| printf("#define L1_CODE_SIZE 65536\n"); | |||
| printf("#define L1_CODE_LINESIZE 64\n"); | |||
| printf("#define L1_CODE_ASSOCIATIVE 4\n"); | |||
| printf("#define L1_DATA_SIZE 65536\n"); | |||
| printf("#define L1_DATA_LINESIZE 64\n"); | |||
| printf("#define L1_DATA_ASSOCIATIVE 4\n"); | |||
| printf("#define L2_SIZE 1048576\n"); | |||
| printf("#define L2_LINESIZE 64\n"); | |||
| printf("#define L2_ASSOCIATIVE 8\n"); | |||
| printf("#define DTB_DEFAULT_ENTRIES 48\n"); | |||
| printf("#define DTB_SIZE 4096\n"); | |||
| break; | |||
| case CPU_NEOVERSEN2: | |||
| printf("#define %s\n", cpuname[d]); | |||
| printf("#define L1_CODE_SIZE 65536\n"); | |||
| printf("#define L1_CODE_LINESIZE 64\n"); | |||
| printf("#define L1_CODE_ASSOCIATIVE 4\n"); | |||
| printf("#define L1_DATA_SIZE 65536\n"); | |||
| printf("#define L1_DATA_LINESIZE 64\n"); | |||
| printf("#define L1_DATA_ASSOCIATIVE 4\n"); | |||
| printf("#define L2_SIZE 1048576\n"); | |||
| printf("#define L2_LINESIZE 64\n"); | |||
| printf("#define L2_ASSOCIATIVE 8\n"); | |||
| printf("#define DTB_DEFAULT_ENTRIES 48\n"); | |||
| printf("#define DTB_SIZE 4096\n"); | |||
| break; | |||
| case CPU_FALKOR: | |||
| printf("#define FALKOR\n"); | |||
| printf("#define L1_CODE_SIZE 65536\n"); | |||
| @@ -165,7 +165,7 @@ void get_cpuconfig(void){ | |||
| }else{ | |||
| printf("#define UNKNOWN\n"); | |||
| } | |||
| if (!get_feature(msa)) printf("#define NO_MSA\n"); | |||
| if (!get_feature("msa")) printf("#define NO_MSA\n"); | |||
| } | |||
| void get_libname(void){ | |||
| @@ -193,7 +193,7 @@ int get_feature(char *search) | |||
| while (fgets(buffer, sizeof(buffer), infile)) | |||
| { | |||
| if (!strncmp("Features", buffer, 8)) | |||
| if (!strncmp("Features", buffer, 8) || !strncmp("ASEs implemented", buffer, 16)) | |||
| { | |||
| p = strchr(buffer, ':') + 2; | |||
| break; | |||
| @@ -207,7 +207,7 @@ int get_feature(char *search) | |||
| t = strtok(p," "); | |||
| while( t = strtok(NULL," ")) | |||
| { | |||
| if (!strcmp(t, search)) { return(1); } | |||
| if (strstr(t, search)) { return(1); } | |||
| } | |||
| #endif | |||
| @@ -201,7 +201,7 @@ void get_cpuconfig(void){ | |||
| printf("#define DTB_SIZE 4096\n"); | |||
| printf("#define L2_ASSOCIATIVE 8\n"); | |||
| } | |||
| if (!get_feature(msa)) printf("#define NO_MSA\n"); | |||
| if (!get_feature("msa")) printf("#define NO_MSA\n"); | |||
| } | |||
| void get_libname(void){ | |||
| @@ -233,7 +233,7 @@ int get_feature(char *search) | |||
| while (fgets(buffer, sizeof(buffer), infile)) | |||
| { | |||
| if (!strncmp("Features", buffer, 8)) | |||
| if (!strncmp("Features", buffer, 8) || !strncmp("ASEs implemented", buffer, 16)) | |||
| { | |||
| p = strchr(buffer, ':') + 2; | |||
| break; | |||
| @@ -247,7 +247,7 @@ int get_feature(char *search) | |||
| t = strtok(p," "); | |||
| while( t = strtok(NULL," ")) | |||
| { | |||
| if (!strcmp(t, search)) { return(1); } | |||
| if (strstr(t, search)) { return(1); } | |||
| } | |||
| #endif | |||
| @@ -323,9 +323,11 @@ int get_vendor(void){ | |||
| int get_cputype(int gettype){ | |||
| int eax, ebx, ecx, edx; | |||
| /* | |||
| int extend_family, family; | |||
| int extend_model, model; | |||
| int type, stepping; | |||
| */ | |||
| int feature = 0; | |||
| cpuid(1, &eax, &ebx, &ecx, &edx); | |||
| @@ -428,7 +430,8 @@ int get_cacheinfo(int type, cache_info_t *cacheinfo){ | |||
| cpuid(0, &cpuid_level, &ebx, &ecx, &edx); | |||
| if (cpuid_level > 1) { | |||
| int numcalls =0 ; | |||
| int numcalls; | |||
| cpuid(2, &eax, &ebx, &ecx, &edx); | |||
| numcalls = BITMASK(eax, 0, 0xff); //FIXME some systems may require repeated calls to read all entries | |||
| info[ 0] = BITMASK(eax, 8, 0xff); | |||
| @@ -1492,6 +1495,10 @@ int get_cpuname(void){ | |||
| switch (model) { | |||
| case 7: // Alder Lake desktop | |||
| case 10: // Alder Lake mobile | |||
| if(support_avx512_bf16()) | |||
| return CPUTYPE_COOPERLAKE; | |||
| if(support_avx512()) | |||
| return CPUTYPE_SKYLAKEX; | |||
| if(support_avx2()) | |||
| return CPUTYPE_HASWELL; | |||
| if(support_avx()) | |||
| @@ -1637,7 +1644,6 @@ int get_cpuname(void){ | |||
| else | |||
| return CPUTYPE_BARCELONA; | |||
| } | |||
| break; | |||
| case 10: // Zen3 | |||
| if(support_avx()) | |||
| #ifndef NO_AVX2 | |||
| @@ -2193,7 +2199,6 @@ int get_coretype(void){ | |||
| else | |||
| return CORE_NEHALEM; | |||
| #endif | |||
| break; | |||
| case 7: | |||
| if (model == 10) | |||
| @@ -2582,4 +2587,4 @@ void get_sse(void){ | |||
| if (features & HAVE_FMA3 ) printf("HAVE_FMA3=1\n"); | |||
| } | |||
| //} | |||
| //} | |||
| @@ -165,3 +165,7 @@ ARCH_LOONGARCH64 | |||
| HAVE_C11 | |||
| #endif | |||
| #if defined(__e2k__) | |||
| ARCH_E2K | |||
| #endif | |||
| @@ -64,9 +64,9 @@ CBLASOBJS += \ | |||
| chpmv_U.$(SUFFIX) chpmv_L.$(SUFFIX) chpmv_V.$(SUFFIX) chpmv_M.$(SUFFIX) \ | |||
| chpr_U.$(SUFFIX) chpr_L.$(SUFFIX) chpr_V.$(SUFFIX) chpr_M.$(SUFFIX) \ | |||
| chpr2_U.$(SUFFIX) chpr2_L.$(SUFFIX) chpr2_V.$(SUFFIX) chpr2_M.$(SUFFIX) \ | |||
| csbmv_U.$(SUFFIX) csbmv_L.$(SUFFIX) cspmv_U.$(SUFFIX) cspmv_L.$(SUFFIX) \ | |||
| cspr_U.$(SUFFIX) cspr_L.$(SUFFIX) cspr2_U.$(SUFFIX) cspr2_L.$(SUFFIX) \ | |||
| csyr_U.$(SUFFIX) csyr_L.$(SUFFIX) csyr2_U.$(SUFFIX) csyr2_L.$(SUFFIX) \ | |||
| csbmv_U.$(SUFFIX) csbmv_L.$(SUFFIX) \ | |||
| cspr2_U.$(SUFFIX) cspr2_L.$(SUFFIX) \ | |||
| csyr2_U.$(SUFFIX) csyr2_L.$(SUFFIX) \ | |||
| ctbmv_NUU.$(SUFFIX) ctbmv_NUN.$(SUFFIX) ctbmv_NLU.$(SUFFIX) ctbmv_NLN.$(SUFFIX) \ | |||
| ctbmv_TUU.$(SUFFIX) ctbmv_TUN.$(SUFFIX) ctbmv_TLU.$(SUFFIX) ctbmv_TLN.$(SUFFIX) \ | |||
| ctbmv_RUU.$(SUFFIX) ctbmv_RUN.$(SUFFIX) ctbmv_RLU.$(SUFFIX) ctbmv_RLN.$(SUFFIX) \ | |||
| @@ -92,6 +92,13 @@ CBLASOBJS += \ | |||
| ctrsv_RUU.$(SUFFIX) ctrsv_RUN.$(SUFFIX) ctrsv_RLU.$(SUFFIX) ctrsv_RLN.$(SUFFIX) \ | |||
| ctrsv_CUU.$(SUFFIX) ctrsv_CUN.$(SUFFIX) ctrsv_CLU.$(SUFFIX) ctrsv_CLN.$(SUFFIX) | |||
| ifndef NO_LAPACK | |||
| CBLASOBJS += \ | |||
| cspmv_U.$(SUFFIX) cspmv_L.$(SUFFIX) \ | |||
| cspr_U.$(SUFFIX) cspr_L.$(SUFFIX) \ | |||
| csyr_U.$(SUFFIX) csyr_L.$(SUFFIX) | |||
| endif | |||
| ZBLASOBJS += \ | |||
| zgbmv_n.$(SUFFIX) zgbmv_t.$(SUFFIX) zgbmv_r.$(SUFFIX) zgbmv_c.$(SUFFIX) \ | |||
| zgbmv_o.$(SUFFIX) zgbmv_u.$(SUFFIX) zgbmv_s.$(SUFFIX) zgbmv_d.$(SUFFIX) \ | |||
| @@ -209,7 +209,8 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ | |||
| /* REAL / Double */ | |||
| void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, | |||
| double *, BLASLONG, double *, BLASLONG, | |||
| double *, BLASLONG, void *) = func; | |||
| double *, BLASLONG, void *) = (void (*)(BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, | |||
| double *, BLASLONG, double *, BLASLONG, void *)) func; | |||
| afunc(args -> m, args -> n, args -> k, | |||
| ((double *)args -> alpha)[0], | |||
| @@ -220,7 +221,10 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ | |||
| /* REAL / Single */ | |||
| void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, | |||
| float *, BLASLONG, float *, BLASLONG, | |||
| float *, BLASLONG, void *) = func; | |||
| float *, BLASLONG, void *) = (void (*) | |||
| (BLASLONG, BLASLONG, BLASLONG, float, | |||
| float *, BLASLONG, float *, BLASLONG, | |||
| float *, BLASLONG, void *)) func; | |||
| afunc(args -> m, args -> n, args -> k, | |||
| ((float *)args -> alpha)[0], | |||
| @@ -232,7 +236,9 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ | |||
| /* REAL / BFLOAT16 */ | |||
| void (*afunc)(BLASLONG, BLASLONG, BLASLONG, bfloat16, | |||
| bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, | |||
| bfloat16 *, BLASLONG, void *) = func; | |||
| bfloat16 *, BLASLONG, void *) = (void (*)(BLASLONG, BLASLONG, BLASLONG, bfloat16, | |||
| bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, | |||
| bfloat16 *, BLASLONG, void *)) func; | |||
| afunc(args -> m, args -> n, args -> k, | |||
| ((bfloat16 *)args -> alpha)[0], | |||
| @@ -243,7 +249,9 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ | |||
| /* REAL / BLAS_STOBF16 */ | |||
| void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, | |||
| float *, BLASLONG, bfloat16 *, BLASLONG, | |||
| float *, BLASLONG, void *) = func; | |||
| float *, BLASLONG, void *) = (void (*)(BLASLONG, BLASLONG, BLASLONG, float, | |||
| float *, BLASLONG, bfloat16 *, BLASLONG, | |||
| float *, BLASLONG, void *)) func; | |||
| afunc(args -> m, args -> n, args -> k, | |||
| ((float *)args -> alpha)[0], | |||
| @@ -254,7 +262,9 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ | |||
| /* REAL / BLAS_DTOBF16 */ | |||
| void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, | |||
| double *, BLASLONG, bfloat16 *, BLASLONG, | |||
| double *, BLASLONG, void *) = func; | |||
| double *, BLASLONG, void *) = (void (*)(BLASLONG, BLASLONG, BLASLONG, double, | |||
| double *, BLASLONG, bfloat16 *, BLASLONG, | |||
| double *, BLASLONG, void *)) func; | |||
| afunc(args -> m, args -> n, args -> k, | |||
| ((double *)args -> alpha)[0], | |||
| @@ -271,7 +281,9 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ | |||
| /* COMPLEX / Extended Double */ | |||
| void (*afunc)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, | |||
| xdouble *, BLASLONG, xdouble *, BLASLONG, | |||
| xdouble *, BLASLONG, void *) = func; | |||
| xdouble *, BLASLONG, void *) = (void (*)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, | |||
| xdouble *, BLASLONG, xdouble *, BLASLONG, | |||
| xdouble *, BLASLONG, void *)) func; | |||
| afunc(args -> m, args -> n, args -> k, | |||
| ((xdouble *)args -> alpha)[0], | |||
| @@ -285,7 +297,9 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ | |||
| /* COMPLEX / Double */ | |||
| void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, double, | |||
| double *, BLASLONG, double *, BLASLONG, | |||
| double *, BLASLONG, void *) = func; | |||
| double *, BLASLONG, void *) = (void (*)(BLASLONG, BLASLONG, BLASLONG, double, double, | |||
| double *, BLASLONG, double *, BLASLONG, | |||
| double *, BLASLONG, void *)) func; | |||
| afunc(args -> m, args -> n, args -> k, | |||
| ((double *)args -> alpha)[0], | |||
| @@ -297,7 +311,9 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ | |||
| /* COMPLEX / Single */ | |||
| void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, float, | |||
| float *, BLASLONG, float *, BLASLONG, | |||
| float *, BLASLONG, void *) = func; | |||
| float *, BLASLONG, void *) = (void (*)(BLASLONG, BLASLONG, BLASLONG, float, float, | |||
| float *, BLASLONG, float *, BLASLONG, | |||
| float *, BLASLONG, void *)) func; | |||
| afunc(args -> m, args -> n, args -> k, | |||
| ((float *)args -> alpha)[0], | |||
| @@ -425,7 +441,7 @@ blas_queue_t *tscq; | |||
| #endif | |||
| if (queue) { | |||
| int (*routine)(blas_arg_t *, void *, void *, void *, void *, BLASLONG) = queue -> routine; | |||
| int (*routine)(blas_arg_t *, void *, void *, void *, void *, BLASLONG) = (int (*)(blas_arg_t *, void *, void *, void *, void *, BLASLONG))queue -> routine; | |||
| atomic_store_queue(&thread_status[cpu].queue, (blas_queue_t *)1); | |||
| @@ -503,7 +519,7 @@ blas_queue_t *tscq; | |||
| legacy_exec(routine, queue -> mode, queue -> args, sb); | |||
| } else | |||
| if (queue -> mode & BLAS_PTHREAD) { | |||
| void (*pthreadcompat)(void *) = queue -> routine; | |||
| void (*pthreadcompat)(void *) = (void(*)(void*))queue -> routine; | |||
| (pthreadcompat)(queue -> args); | |||
| } else | |||
| (routine)(queue -> args, queue -> range_m, queue -> range_n, sa, sb, queue -> position); | |||
| @@ -871,13 +887,13 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){ | |||
| fprintf(STDERR, "\n"); | |||
| #endif | |||
| routine = queue -> routine; | |||
| routine = (int (*)(blas_arg_t *, void *, void *, double *, double *, BLASLONG))queue -> routine; | |||
| if (queue -> mode & BLAS_LEGACY) { | |||
| legacy_exec(routine, queue -> mode, queue -> args, queue -> sb); | |||
| } else | |||
| if (queue -> mode & BLAS_PTHREAD) { | |||
| void (*pthreadcompat)(void *) = queue -> routine; | |||
| void (*pthreadcompat)(void *) = (void (*)(void*))queue -> routine; | |||
| (pthreadcompat)(queue -> args); | |||
| } else | |||
| (routine)(queue -> args, queue -> range_m, queue -> range_n, | |||
| @@ -708,8 +708,11 @@ static gotoblas_t *get_coretype(void){ | |||
| case 9: | |||
| if (model == 7 || model == 10) { // Alder Lake | |||
| if(support_avx512_bf16()) | |||
| return &gotoblas_COOPERLAKE; | |||
| if (support_avx512()) | |||
| return &gotoblas_SKYLAKEX; | |||
| if(support_avx2()){ | |||
| openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK); | |||
| return &gotoblas_HASWELL; | |||
| } | |||
| if(support_avx()) { | |||
| @@ -147,6 +147,8 @@ static char *corename[] = { | |||
| "tsv110", | |||
| "emag8180", | |||
| "neoversen1", | |||
| "neoversev1", | |||
| "neoversen2", | |||
| "thunderx3t110", | |||
| "cortexa55", | |||
| "unknown" | |||
| @@ -232,11 +232,11 @@ int get_num_procs(void); | |||
| #else | |||
| int get_num_procs(void) { | |||
| static int nums = 0; | |||
| int ret; | |||
| #if defined(__GLIBC_PREREQ) | |||
| cpu_set_t cpuset,*cpusetp; | |||
| size_t size; | |||
| int ret; | |||
| #if !__GLIBC_PREREQ(2, 7) | |||
| int i; | |||
| #if !__GLIBC_PREREQ(2, 6) | |||
| @@ -249,7 +249,8 @@ int get_num_procs(void) { | |||
| #if defined(USE_OPENMP) | |||
| #if _OPENMP >= 201511 | |||
| nums = omp_get_num_places(); | |||
| ret = omp_get_num_places(); | |||
| if (ret >0 ) nums = ret; | |||
| #endif | |||
| return nums; | |||
| #endif | |||
| @@ -1800,11 +1801,12 @@ int get_num_procs(void); | |||
| int get_num_procs(void) { | |||
| static int nums = 0; | |||
| int ret; | |||
| #if defined(__GLIBC_PREREQ) | |||
| cpu_set_t cpuset,*cpusetp; | |||
| size_t size; | |||
| int ret; | |||
| #if !__GLIBC_PREREQ(2, 7) | |||
| int i; | |||
| #if !__GLIBC_PREREQ(2, 6) | |||
| @@ -1818,7 +1820,8 @@ int get_num_procs(void) { | |||
| #if defined(USE_OPENMP) | |||
| /* if (omp_get_proc_bind() != omp_proc_bind_false) */ | |||
| #if _OPENMP >= 201511 | |||
| nums = omp_get_num_places(); | |||
| ret = omp_get_num_places(); | |||
| if (ret >0 ) nums = ret; | |||
| #endif | |||
| return nums; | |||
| #endif | |||
| @@ -142,10 +142,14 @@ ifneq (,$(filter 1 2,$(NOFORTRAN))) | |||
| else | |||
| ifeq ($(F_COMPILER), INTEL) | |||
| $(FC) $(FFLAGS) $(LDFLAGS) -all-load -headerpad_max_install_names -install_name "$(CURDIR)/../$(INTERNALNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def | |||
| else | |||
| ifeq ($(F_COMPILER), FLANG) | |||
| $(FC) $(FFLAGS) $(LDFLAGS) -fno-fortran-main -Mnomain -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(INTERNALNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) | |||
| else | |||
| $(FC) $(FFLAGS) $(LDFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(INTERNALNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) | |||
| endif | |||
| endif | |||
| endif | |||
| dllinit.$(SUFFIX) : dllinit.c | |||
| $(CC) $(CFLAGS) -c -o $(@F) -s $< | |||
| @@ -361,6 +361,7 @@ if ($link ne "") { | |||
| ($flags =~ /^\-l/) | |||
| && ($flags !~ /ibrary/) | |||
| && ($flags !~ /gfortranbegin/) | |||
| && ($flags !~ /flangmain/) | |||
| && ($flags !~ /frtbegin/) | |||
| && ($flags !~ /pathfstart/) | |||
| && ($flags !~ /crt[0-9]/) | |||
| @@ -1302,12 +1302,47 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| "-DL2_SIZE=1048576 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \ | |||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ | |||
| "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8 " \ | |||
| "-march=armv8.2-a -mtune=cortex-a72" | |||
| "-march=armv8.2-a -mtune=neoverse-n1" | |||
| #define LIBNAME "neoversen1" | |||
| #define CORENAME "NEOVERSEN1" | |||
| #else | |||
| #endif | |||
| #ifdef FORCE_NEOVERSEV1 | |||
| #define FORCE | |||
| #define ARCHITECTURE "ARM64" | |||
| #define SUBARCHITECTURE "NEOVERSEV1" | |||
| #define SUBDIRNAME "arm64" | |||
| #define ARCHCONFIG "-DNEOVERSEV1 " \ | |||
| "-DL1_CODE_SIZE=65536 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=4 " \ | |||
| "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=4 " \ | |||
| "-DL2_SIZE=1048576 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \ | |||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ | |||
| "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DHAVE_SVE -DARMV8 " \ | |||
| "-march=armv8.4-a -mtune=neoverse-v1" | |||
| #define LIBNAME "neoversev1" | |||
| #define CORENAME "NEOVERSEV1" | |||
| #else | |||
| #endif | |||
| #ifdef FORCE_NEOVERSEN2 | |||
| #define FORCE | |||
| #define ARCHITECTURE "ARM64" | |||
| #define SUBARCHITECTURE "NEOVERSEN2" | |||
| #define SUBDIRNAME "arm64" | |||
| #define ARCHCONFIG "-DNEOVERSEN2 " \ | |||
| "-DL1_CODE_SIZE=65536 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=4 " \ | |||
| "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=4 " \ | |||
| "-DL2_SIZE=1048576 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \ | |||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ | |||
| "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DHAVE_SVE -DARMV8 " \ | |||
| "-march=armv8.5-a -mtune=neoverse-n2" | |||
| #define LIBNAME "neoversen2" | |||
| #define CORENAME "NEOVERSEN2" | |||
| #else | |||
| #endif | |||
| #ifdef FORCE_CORTEXA55 | |||
| #define FORCE | |||
| #define ARCHITECTURE "ARM64" | |||
| @@ -1501,6 +1536,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| #if defined(FORCE_E2K) || defined(__e2k__) | |||
| #define FORCE | |||
| #define ARCHITECTURE "E2K" | |||
| #define ARCHCONFIG "-DGENERIC " \ | |||
| "-DL1_DATA_SIZE=16384 -DL1_DATA_LINESIZE=64 " \ | |||
| "-DL2_SIZE=524288 -DL2_LINESIZE=64 " \ | |||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " | |||
| #define LIBNAME "generic" | |||
| #define CORENAME "generic" | |||
| #endif | |||
| #ifndef FORCE | |||
| #ifdef USER_TARGET | |||
| @@ -28,14 +28,21 @@ set(BLAS1_MANGLED_SOURCES | |||
| # these all have 'z' sources for complex versions | |||
| set(BLAS2_SOURCES | |||
| gemv.c ger.c | |||
| trsv.c trmv.c symv.c | |||
| syr.c syr2.c gbmv.c | |||
| sbmv.c spmv.c | |||
| spr.c spr2.c | |||
| trsv.c trmv.c | |||
| syr2.c gbmv.c | |||
| sbmv.c | |||
| spr2.c | |||
| tbsv.c tbmv.c | |||
| tpsv.c tpmv.c | |||
| ) | |||
| set(BLAS2_REAL_ONLY_SOURCES | |||
| symv.c syr.c spmv.c spr.c | |||
| ) | |||
| set(BLAS2_COMPLEX_LAPACK_SOURCES | |||
| symv.c syr.c spmv.c spr.c | |||
| ) | |||
| set(BLAS2_COMPLEX_ONLY_MANGLED_SOURCES | |||
| hemv.c hbmv.c | |||
| her.c her2.c | |||
| @@ -78,6 +85,10 @@ foreach (CBLAS_FLAG ${CBLAS_FLAGS}) | |||
| GenerateNamedObjects("${BLAS1_REAL_ONLY_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false 1) | |||
| GenerateNamedObjects("${BLAS1_MANGLED_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false ${MANGLE_COMPLEX}) | |||
| GenerateNamedObjects("${BLAS2_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false ${MANGLE_COMPLEX}) | |||
| GenerateNamedObjects("${BLAS2_REAL_ONLY_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false 1) | |||
| if (NOT DEFINED NO_LAPACK) | |||
| GenerateNamedObjects("${BLAS2_COMPLEX_LAPACK_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false ${MANGLE_COMPLEX}) | |||
| endif () | |||
| GenerateNamedObjects("${BLAS2_COMPLEX_ONLY_MANGLED_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false 4) | |||
| GenerateNamedObjects("${BLAS3_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false ${DISABLE_COMPLEX}) | |||
| GenerateNamedObjects("${BLAS3_MANGLED_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false ${MANGLE_COMPLEX}) | |||
| @@ -1016,11 +1016,13 @@ dsymv.$(SUFFIX) dsymv.$(PSUFFIX) : symv.c | |||
| qsymv.$(SUFFIX) qsymv.$(PSUFFIX) : symv.c | |||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||
| ifndef NO_LAPACK | |||
| csymv.$(SUFFIX) csymv.$(PSUFFIX) : zsymv.c | |||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||
| zsymv.$(SUFFIX) zsymv.$(PSUFFIX) : zsymv.c | |||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||
| endif | |||
| xsymv.$(SUFFIX) xsymv.$(PSUFFIX) : zsymv.c | |||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||
| @@ -1034,11 +1036,13 @@ dsyr.$(SUFFIX) dsyr.$(PSUFFIX) : syr.c | |||
| qsyr.$(SUFFIX) qsyr.$(PSUFFIX) : syr.c | |||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||
| ifndef NO_LAPACK | |||
| csyr.$(SUFFIX) csyr.$(PSUFFIX) : zsyr.c | |||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||
| zsyr.$(SUFFIX) zsyr.$(PSUFFIX) : zsyr.c | |||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||
| endif | |||
| xsyr.$(SUFFIX) xsyr.$(PSUFFIX) : zsyr.c | |||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||
| @@ -1106,11 +1110,13 @@ dspmv.$(SUFFIX) dspmv.$(PSUFFIX) : spmv.c | |||
| qspmv.$(SUFFIX) qspmv.$(PSUFFIX) : spmv.c | |||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||
| ifndef NO_LAPACK | |||
| cspmv.$(SUFFIX) cspmv.$(PSUFFIX) : zspmv.c | |||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||
| zspmv.$(SUFFIX) zspmv.$(PSUFFIX) : zspmv.c | |||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||
| endif | |||
| xspmv.$(SUFFIX) xspmv.$(PSUFFIX) : zspmv.c | |||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||
| @@ -1124,11 +1130,13 @@ dspr.$(SUFFIX) dspr.$(PSUFFIX) : spr.c | |||
| qspr.$(SUFFIX) qspr.$(PSUFFIX) : spr.c | |||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||
| ifndef NO_LAPACK | |||
| cspr.$(SUFFIX) cspr.$(PSUFFIX) : zspr.c | |||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||
| zspr.$(SUFFIX) zspr.$(PSUFFIX) : zspr.c | |||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||
| endif | |||
| xspr.$(SUFFIX) xspr.$(PSUFFIX) : zspr.c | |||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||
| @@ -115,7 +115,7 @@ void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *y, blasint inc | |||
| #endif | |||
| blas_level1_thread(mode, n, 0, 0, &alpha, | |||
| x, incx, y, incy, NULL, 0, (void *)AXPYU_K, nthreads); | |||
| x, incx, y, incy, NULL, 0, (int (*)(void))AXPYU_K, nthreads); | |||
| } | |||
| #endif | |||
| @@ -102,7 +102,7 @@ void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx){ | |||
| #else | |||
| &alpha, | |||
| #endif | |||
| x, incx, NULL, 0, NULL, 0, (void *)SCAL_K, nthreads); | |||
| x, incx, NULL, 0, NULL, 0, (int (*)(void))SCAL_K, nthreads); | |||
| } | |||
| #endif | |||
| @@ -128,9 +128,9 @@ void CNAME(blasint n, FLOAT *ALPHA, FLOAT *x, blasint incx, FLOAT *y, blasint in | |||
| blas_level1_thread(mode, n, 0, 0, ALPHA, x, incx, y, incy, NULL, 0, | |||
| #ifndef CONJ | |||
| (void *)AXPYU_K, | |||
| (int (*)(void))AXPYU_K, | |||
| #else | |||
| (void *)AXPYC_K, | |||
| (int (*)(void))AXPYC_K, | |||
| #endif | |||
| nthreads); | |||
| } | |||
| @@ -108,7 +108,7 @@ void CNAME(blasint n, FLOAT alpha_r, void *vx, blasint incx){ | |||
| mode = BLAS_SINGLE | BLAS_COMPLEX; | |||
| #endif | |||
| blas_level1_thread(mode, n, 0, 0, alpha, x, incx, NULL, 0, NULL, 0, (void *)SCAL_K, nthreads); | |||
| blas_level1_thread(mode, n, 0, 0, alpha, x, incx, NULL, 0, NULL, 0, (int (*)(void))SCAL_K, nthreads); | |||
| } | |||
| #endif | |||
| @@ -323,55 +323,93 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) | |||
| #hemm | |||
| GenerateNamedObjects("generic/zhemm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "" "hemm_iutcopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("generic/zhemm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "hemm_iltcopy" false "" "" false ${float_type}) | |||
| # Inner HEMM copy sources: when ${float_char}HEMMUTCOPY_M is defined, both the | |||
| # upper and lower files are taken from KERNELDIR; otherwise the generic | |||
| # zhemm_*copy file matching this precision's GEMM_UNROLL_M is used. | |||
| if (DEFINED ${float_char}HEMMUTCOPY_M) | |||
| set(HEMMLTCOPY_M "${KERNELDIR}/${${float_char}HEMMLTCOPY_M}") | |||
| set(HEMMUTCOPY_M "${KERNELDIR}/${${float_char}HEMMUTCOPY_M}") | |||
| else () | |||
| set(HEMMLTCOPY_M "generic/zhemm_ltcopy_${${float_char}GEMM_UNROLL_M}.c") | |||
| set(HEMMUTCOPY_M "generic/zhemm_utcopy_${${float_char}GEMM_UNROLL_M}.c") | |||
| endif () | |||
| GenerateNamedObjects(${HEMMUTCOPY_M} "" "hemm_iutcopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects(${HEMMLTCOPY_M} "LOWER" "hemm_iltcopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("generic/zhemm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "hemm_outcopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("generic/zhemm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "hemm_oltcopy" false "" "" false ${float_type}) | |||
| # symm for c and z | |||
| # Inner SYMM copy sources for this precision. Each variable is looked up on its | |||
| # own (as the Makefile build does with separate ifdef blocks), so a target that | |||
| # overrides only one of them still gets the generic file for the other rather | |||
| # than a bare "${KERNELDIR}/" path. | |||
| foreach (symm_copy_kind UCOPY LCOPY) | |||
| string(TOLOWER "${symm_copy_kind}" symm_copy_kind_lc) | |||
| if (DEFINED ${float_char}SYMM${symm_copy_kind}_M) | |||
| set(SYMM${symm_copy_kind}_M "${KERNELDIR}/${${float_char}SYMM${symm_copy_kind}_M}") | |||
| else () | |||
| set(SYMM${symm_copy_kind}_M "generic/zsymm_${symm_copy_kind_lc}_${${float_char}GEMM_UNROLL_M}.c") | |||
| endif () | |||
| endforeach () | |||
| GenerateNamedObjects("generic/zsymm_ucopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "symm_outcopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("generic/zsymm_ucopy_${${float_char}GEMM_UNROLL_M}.c" "" "symm_iutcopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects(${SYMMUCOPY_M} "" "symm_iutcopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("generic/zsymm_lcopy_${${float_char}GEMM_UNROLL_N}.c" "LOWER;OUTER" "symm_oltcopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("generic/zsymm_lcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "symm_iltcopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects(${SYMMLCOPY_M} "LOWER" "symm_iltcopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("generic/ztrmm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trmm_iunucopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("generic/ztrmm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "" "trmm_iunncopy" false "" "" false ${float_type}) | |||
| # Inner TRMM copy sources for this precision. Each of the four variables is | |||
| # looked up on its own (as the Makefile build does with separate ifdef blocks), | |||
| # so a target that overrides only some of them still gets the generic file for | |||
| # the rest rather than a bare "${KERNELDIR}/" path. | |||
| foreach (trmm_copy_kind UNCOPY LNCOPY UTCOPY LTCOPY) | |||
| string(TOLOWER "${trmm_copy_kind}" trmm_copy_kind_lc) | |||
| if (DEFINED ${float_char}TRMM${trmm_copy_kind}_M) | |||
| set(TRMM${trmm_copy_kind}_M "${KERNELDIR}/${${float_char}TRMM${trmm_copy_kind}_M}") | |||
| else () | |||
| set(TRMM${trmm_copy_kind}_M "generic/ztrmm_${trmm_copy_kind_lc}_${${float_char}GEMM_UNROLL_M}.c") | |||
| endif () | |||
| endforeach () | |||
| GenerateNamedObjects(${TRMMUNCOPY_M} "UNIT" "trmm_iunucopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects(${TRMMUNCOPY_M} "" "trmm_iunncopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("generic/ztrmm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trmm_ounucopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("generic/ztrmm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trmm_ounncopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("generic/ztrmm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trmm_ilnucopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("generic/ztrmm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trmm_ilnncopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects(${TRMMLNCOPY_M} "LOWER;UNIT" "trmm_ilnucopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects(${TRMMLNCOPY_M} "LOWER" "trmm_ilnncopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("generic/ztrmm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trmm_olnucopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("generic/ztrmm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trmm_olnncopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("generic/ztrmm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trmm_iutucopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("generic/ztrmm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "" "trmm_iutncopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects(${TRMMUTCOPY_M} "UNIT" "trmm_iutucopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects(${TRMMUTCOPY_M} "" "trmm_iutncopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("generic/ztrmm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trmm_outucopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("generic/ztrmm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trmm_outncopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("generic/ztrmm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trmm_iltucopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("generic/ztrmm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trmm_iltncopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects(${TRMMLTCOPY_M} "LOWER;UNIT" "trmm_iltucopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects(${TRMMLTCOPY_M} "LOWER" "trmm_iltncopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("generic/ztrmm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trmm_oltucopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("generic/ztrmm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trmm_oltncopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("generic/ztrsm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trsm_iunucopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("generic/ztrsm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "" "trsm_iunncopy" false "" "" false ${float_type}) | |||
| # Inner complex TRSM copy sources: when ZTRSMCOPYLN_M is defined, all four files | |||
| # are taken from KERNELDIR; otherwise the generic ztrsm_*copy file matching this | |||
| # precision's GEMM_UNROLL_M is used. | |||
| if (DEFINED ZTRSMCOPYLN_M) | |||
| set(ZTRSMLTCOPY_M "${KERNELDIR}/${ZTRSMCOPYLT_M}") | |||
| set(ZTRSMUTCOPY_M "${KERNELDIR}/${ZTRSMCOPYUT_M}") | |||
| set(ZTRSMLNCOPY_M "${KERNELDIR}/${ZTRSMCOPYLN_M}") | |||
| set(ZTRSMUNCOPY_M "${KERNELDIR}/${ZTRSMCOPYUN_M}") | |||
| else () | |||
| set(ZTRSMLTCOPY_M "generic/ztrsm_ltcopy_${${float_char}GEMM_UNROLL_M}.c") | |||
| set(ZTRSMUTCOPY_M "generic/ztrsm_utcopy_${${float_char}GEMM_UNROLL_M}.c") | |||
| set(ZTRSMLNCOPY_M "generic/ztrsm_lncopy_${${float_char}GEMM_UNROLL_M}.c") | |||
| set(ZTRSMUNCOPY_M "generic/ztrsm_uncopy_${${float_char}GEMM_UNROLL_M}.c") | |||
| endif () | |||
| GenerateNamedObjects(${ZTRSMUNCOPY_M} "UNIT" "trsm_iunucopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects(${ZTRSMUNCOPY_M} "" "trsm_iunncopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("generic/ztrsm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trsm_ounucopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("generic/ztrsm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trsm_ounncopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("generic/ztrsm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trsm_ilnucopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("generic/ztrsm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trsm_ilnncopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects(${ZTRSMLNCOPY_M} "LOWER;UNIT" "trsm_ilnucopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects(${ZTRSMLNCOPY_M} "LOWER" "trsm_ilnncopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("generic/ztrsm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trsm_olnucopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("generic/ztrsm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trsm_olnncopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("generic/ztrsm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trsm_iutucopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("generic/ztrsm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "" "trsm_iutncopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects(${ZTRSMUTCOPY_M} "UNIT" "trsm_iutucopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects(${ZTRSMUTCOPY_M} "" "trsm_iutncopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("generic/ztrsm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trsm_outucopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("generic/ztrsm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trsm_outncopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("generic/ztrsm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trsm_iltucopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("generic/ztrsm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trsm_iltncopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects(${ZTRSMLTCOPY_M} "LOWER;UNIT" "trsm_iltucopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects(${ZTRSMLTCOPY_M} "LOWER" "trsm_iltncopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("generic/ztrsm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trsm_oltucopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("generic/ztrsm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trsm_oltncopy" false "" "" false ${float_type}) | |||
| @@ -465,23 +503,35 @@ endif () | |||
| GenerateNamedObjects("generic/trmm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trmm_oltucopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("generic/trmm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trmm_oltncopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("generic/trsm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trsm_iunucopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("generic/trsm_uncopy_${${float_char}GEMM_UNROLL_M}.c" "" "trsm_iunncopy" false "" "" false ${float_type}) | |||
| # Inner real TRSM copy sources: when TRSMCOPYLN_M is defined, all four files are | |||
| # taken from KERNELDIR; otherwise the generic trsm_*copy file matching this | |||
| # precision's GEMM_UNROLL_M is used. | |||
| if (DEFINED TRSMCOPYLN_M) | |||
| set(TRSMLTCOPY_M "${KERNELDIR}/${TRSMCOPYLT_M}") | |||
| set(TRSMUTCOPY_M "${KERNELDIR}/${TRSMCOPYUT_M}") | |||
| set(TRSMLNCOPY_M "${KERNELDIR}/${TRSMCOPYLN_M}") | |||
| set(TRSMUNCOPY_M "${KERNELDIR}/${TRSMCOPYUN_M}") | |||
| else () | |||
| set(TRSMLTCOPY_M "generic/trsm_ltcopy_${${float_char}GEMM_UNROLL_M}.c") | |||
| set(TRSMUTCOPY_M "generic/trsm_utcopy_${${float_char}GEMM_UNROLL_M}.c") | |||
| set(TRSMLNCOPY_M "generic/trsm_lncopy_${${float_char}GEMM_UNROLL_M}.c") | |||
| set(TRSMUNCOPY_M "generic/trsm_uncopy_${${float_char}GEMM_UNROLL_M}.c") | |||
| endif () | |||
| GenerateNamedObjects(${TRSMUNCOPY_M} "UNIT" "trsm_iunucopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects(${TRSMUNCOPY_M} "" "trsm_iunncopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("generic/trsm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trsm_ounucopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("generic/trsm_uncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trsm_ounncopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("generic/trsm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trsm_ilnucopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("generic/trsm_lncopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trsm_ilnncopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects(${TRSMLNCOPY_M} "LOWER;UNIT" "trsm_ilnucopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects(${TRSMLNCOPY_M} "LOWER" "trsm_ilnncopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("generic/trsm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trsm_olnucopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("generic/trsm_lncopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trsm_olnncopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("generic/trsm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "UNIT" "trsm_iutucopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("generic/trsm_utcopy_${${float_char}GEMM_UNROLL_M}.c" "" "trsm_iutncopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects(${TRSMUTCOPY_M} "UNIT" "trsm_iutucopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects(${TRSMUTCOPY_M} "" "trsm_iutncopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("generic/trsm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;UNIT" "trsm_outucopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("generic/trsm_utcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER" "trsm_outncopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("generic/trsm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER;UNIT" "trsm_iltucopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("generic/trsm_ltcopy_${${float_char}GEMM_UNROLL_M}.c" "LOWER" "trsm_iltncopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects(${TRSMLTCOPY_M} "LOWER;UNIT" "trsm_iltucopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects(${TRSMLTCOPY_M} "LOWER" "trsm_iltncopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("generic/trsm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trsm_oltucopy" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("generic/trsm_ltcopy_${${float_char}GEMM_UNROLL_N}.c" "OUTER;LOWER" "trsm_oltncopy" false "" "" false ${float_type}) | |||
| @@ -617,6 +617,10 @@ $(KDIR)zgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_BETA) | |||
| $(KDIR)xgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMM_BETA) | |||
| $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX $< -o $@ | |||
| ifeq ($(ARCH), E2K) | |||
| USE_TRMM = 1 | |||
| endif | |||
| ifeq ($(BUILD_BFLOAT16), 1) | |||
| @@ -1691,29 +1695,61 @@ $(KDIR)qtrmm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(QGEMM_UNROLL_N | |||
| $(KDIR)qtrmm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(QGEMM_UNROLL_N).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ | |||
| ifdef CTRMMUNCOPY_M | |||
| $(KDIR)ctrmm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMUNCOPY_M) | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ | |||
| $(KDIR)ctrmm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMUNCOPY_M) | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ | |||
| else | |||
| $(KDIR)ctrmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_M).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ | |||
| $(KDIR)ctrmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_M).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ | |||
| endif | |||
| $(KDIR)ctrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(CGEMM_UNROLL_M).c | |||
| # CTRMM inner lower/no-transpose copy objects (UNIT and non-UNIT variants). | |||
| # Both branches must compile with -DLOWER: the generic fallback previously | |||
| # passed -ULOWER, unlike the kernel-specific branch and the ZTRMM rules. | |||
| ifdef CTRMMLNCOPY_M | |||
| $(KDIR)ctrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMLNCOPY_M) | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ | |||
| $(KDIR)ctrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(CGEMM_UNROLL_M).c | |||
| $(KDIR)ctrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMLNCOPY_M) | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ | |||
| else | |||
| $(KDIR)ctrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(CGEMM_UNROLL_M).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ | |||
| $(KDIR)ctrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(CGEMM_UNROLL_M).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ | |||
| endif | |||
| ifdef CTRMMUTCOPY_M | |||
| $(KDIR)ctrmm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMUTCOPY_M) | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ | |||
| $(KDIR)ctrmm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMUTCOPY_M) | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ | |||
| else | |||
| $(KDIR)ctrmm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(CGEMM_UNROLL_M).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ | |||
| $(KDIR)ctrmm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(CGEMM_UNROLL_M).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ | |||
| endif | |||
| $(KDIR)ctrmm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_M).c | |||
| # CTRMM inner lower/transpose copy objects (UNIT and non-UNIT variants). | |||
| # Both branches must compile with -DLOWER: the generic fallback previously | |||
| # passed -ULOWER, unlike the kernel-specific branch and the ZTRMM rules. | |||
| ifdef CTRMMLTCOPY_M | |||
| $(KDIR)ctrmm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMLTCOPY_M) | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ | |||
| $(KDIR)ctrmm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_M).c | |||
| $(KDIR)ctrmm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMLTCOPY_M) | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ | |||
| else | |||
| $(KDIR)ctrmm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_M).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ | |||
| $(KDIR)ctrmm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_M).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ | |||
| endif | |||
| $(KDIR)ctrmm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(CGEMM_UNROLL_N).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ | |||
| @@ -1739,29 +1775,61 @@ $(KDIR)ctrmm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_ | |||
| $(KDIR)ctrmm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_N).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ | |||
| ifdef ZTRMMUNCOPY_M | |||
| $(KDIR)ztrmm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMUNCOPY_M) | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ | |||
| $(KDIR)ztrmm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMUNCOPY_M) | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ | |||
| else | |||
| $(KDIR)ztrmm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(ZGEMM_UNROLL_M).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ | |||
| $(KDIR)ztrmm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(ZGEMM_UNROLL_M).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ | |||
| endif | |||
| ifdef ZTRMMLNCOPY_M | |||
| $(KDIR)ztrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMLNCOPY_M) | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ | |||
| $(KDIR)ztrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMLNCOPY_M) | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ | |||
| else | |||
| $(KDIR)ztrmm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(ZGEMM_UNROLL_M).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ | |||
| $(KDIR)ztrmm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_lncopy_$(ZGEMM_UNROLL_M).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ | |||
| endif | |||
| ifdef ZTRMMUTCOPY_M | |||
| $(KDIR)ztrmm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMUTCOPY_M) | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ | |||
| $(KDIR)ztrmm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMUTCOPY_M) | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ | |||
| else | |||
| $(KDIR)ztrmm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(ZGEMM_UNROLL_M).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ | |||
| $(KDIR)ztrmm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_utcopy_$(ZGEMM_UNROLL_M).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ | |||
| endif | |||
| ifdef ZTRMMLTCOPY_M | |||
| $(KDIR)ztrmm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMLTCOPY_M) | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ | |||
| $(KDIR)ztrmm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMLTCOPY_M) | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ | |||
| else | |||
| $(KDIR)ztrmm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(ZGEMM_UNROLL_M).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ | |||
| $(KDIR)ztrmm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(ZGEMM_UNROLL_M).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ | |||
| endif | |||
| $(KDIR)ztrmm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(ZGEMM_UNROLL_N).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ | |||
| @@ -1897,11 +1965,21 @@ $(KDIR)csymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_ucopy_$(CGEMM_UNROLL_N) | |||
| $(KDIR)csymm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_lcopy_$(CGEMM_UNROLL_N).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER $< -o $@ | |||
| ifdef CSYMMUCOPY_M | |||
| $(KDIR)csymm_iutcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CSYMMUCOPY_M) | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER $< -o $@ | |||
| else | |||
| $(KDIR)csymm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_ucopy_$(CGEMM_UNROLL_M).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER $< -o $@ | |||
| endif | |||
| ifdef CSYMMLCOPY_M | |||
| $(KDIR)csymm_iltcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CSYMMLCOPY_M) | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER $< -o $@ | |||
| else | |||
| $(KDIR)csymm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_lcopy_$(CGEMM_UNROLL_M).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER $< -o $@ | |||
| endif | |||
| $(KDIR)zsymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_ucopy_$(ZGEMM_UNROLL_N).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER $< -o $@ | |||
| @@ -1909,11 +1987,21 @@ $(KDIR)zsymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_ucopy_$(ZGEMM_UNROLL_N) | |||
| $(KDIR)zsymm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_lcopy_$(ZGEMM_UNROLL_N).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -DLOWER $< -o $@ | |||
| ifdef ZSYMMUCOPY_M | |||
| $(KDIR)zsymm_iutcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZSYMMUCOPY_M) | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER $< -o $@ | |||
| else | |||
| $(KDIR)zsymm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_ucopy_$(ZGEMM_UNROLL_M).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER $< -o $@ | |||
| endif | |||
| ifdef ZSYMMLCOPY_M | |||
| $(KDIR)zsymm_iltcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZSYMMLCOPY_M) | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER $< -o $@ | |||
| else | |||
| $(KDIR)zsymm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_lcopy_$(ZGEMM_UNROLL_M).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER $< -o $@ | |||
| endif | |||
| $(KDIR)xsymm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zsymm_ucopy_$(XGEMM_UNROLL_N).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER -ULOWER $< -o $@ | |||
| @@ -1933,11 +2021,21 @@ $(KDIR)chemm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(CGEMM_UNROLL_N | |||
| $(KDIR)chemm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_ltcopy_$(CGEMM_UNROLL_N).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER $< -DLOWER -o $@ | |||
| ifdef CHEMMUTCOPY_M | |||
| $(KDIR)chemm_iutcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CHEMMUTCOPY_M) | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER $< -ULOWER -o $@ | |||
| else | |||
| $(KDIR)chemm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(CGEMM_UNROLL_M).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER $< -ULOWER -o $@ | |||
| endif | |||
| ifdef CHEMMLTCOPY_M | |||
| $(KDIR)chemm_iltcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CHEMMLTCOPY_M) | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@ | |||
| else | |||
| $(KDIR)chemm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_ltcopy_$(CGEMM_UNROLL_M).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@ | |||
| endif | |||
| $(KDIR)zhemm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(ZGEMM_UNROLL_N).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER $< -ULOWER -o $@ | |||
| @@ -1945,11 +2043,21 @@ $(KDIR)zhemm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(ZGEMM_UNROLL_N | |||
| $(KDIR)zhemm_oltcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_ltcopy_$(ZGEMM_UNROLL_N).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER $< -DLOWER -o $@ | |||
| ifdef ZHEMMUTCOPY_M | |||
| $(KDIR)zhemm_iutcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZHEMMUTCOPY_M) | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER $< -ULOWER -o $@ | |||
| else | |||
| $(KDIR)zhemm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(ZGEMM_UNROLL_M).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER $< -ULOWER -o $@ | |||
| endif | |||
| ifdef ZHEMMLTCOPY_M | |||
| $(KDIR)zhemm_iltcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZHEMMLTCOPY_M) | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@ | |||
| else | |||
| $(KDIR)zhemm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_ltcopy_$(ZGEMM_UNROLL_M).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@ | |||
| endif | |||
| $(KDIR)xhemm_outcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(XGEMM_UNROLL_N).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -DOUTER $< -ULOWER -o $@ | |||
| @@ -2287,29 +2395,61 @@ $(KDIR)xhemm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNR | |||
| $(KDIR)xhemm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_M).c | |||
| $(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@ | |||
| ifdef TRSMCOPYUN_M | |||
| $(KDIR)strsm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUN_M) | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ | |||
| $(KDIR)strsm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUN_M) | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ | |||
| else | |||
| $(KDIR)strsm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_M).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ | |||
| $(KDIR)strsm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_M).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ | |||
| endif | |||
| ifdef TRSMCOPYLN_M | |||
| $(KDIR)strsm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLN_M) | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ | |||
| $(KDIR)strsm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLN_M) | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ | |||
| else | |||
| $(KDIR)strsm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(SGEMM_UNROLL_M).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ | |||
| $(KDIR)strsm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(SGEMM_UNROLL_M).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ | |||
| endif | |||
| ifdef TRSMCOPYUT_M | |||
| $(KDIR)strsm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUT_M) | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ | |||
| $(KDIR)strsm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUT_M) | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ | |||
| else | |||
| $(KDIR)strsm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(SGEMM_UNROLL_M).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ | |||
| $(KDIR)strsm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(SGEMM_UNROLL_M).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ | |||
| endif | |||
| ifdef TRSMCOPYLT_M | |||
| $(KDIR)strsm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLT_M) | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ | |||
| $(KDIR)strsm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLT_M) | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ | |||
| else | |||
| $(KDIR)strsm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(SGEMM_UNROLL_M).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ | |||
| $(KDIR)strsm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(SGEMM_UNROLL_M).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ | |||
| endif | |||
| $(KDIR)strsm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_N).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ | |||
| @@ -2335,29 +2475,61 @@ $(KDIR)strsm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(SGEMM_UNROLL_N | |||
| $(KDIR)strsm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(SGEMM_UNROLL_N).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ | |||
| ifdef TRSMCOPYUN_M | |||
| $(KDIR)dtrsm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUN_M) | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ | |||
| $(KDIR)dtrsm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUN_M) | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ | |||
| else | |||
| $(KDIR)dtrsm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(DGEMM_UNROLL_M).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ | |||
| $(KDIR)dtrsm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(DGEMM_UNROLL_M).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ | |||
| endif | |||
| ifdef TRSMCOPYLN_M | |||
| $(KDIR)dtrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLN_M) | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ | |||
| $(KDIR)dtrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLN_M) | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ | |||
| else | |||
| $(KDIR)dtrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(DGEMM_UNROLL_M).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ | |||
| $(KDIR)dtrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_lncopy_$(DGEMM_UNROLL_M).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ | |||
| endif | |||
| ifdef TRSMCOPYUT_M | |||
| $(KDIR)dtrsm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUT_M) | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ | |||
| $(KDIR)dtrsm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYUT_M) | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ | |||
| else | |||
| $(KDIR)dtrsm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(DGEMM_UNROLL_M).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ | |||
| $(KDIR)dtrsm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_utcopy_$(DGEMM_UNROLL_M).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ | |||
| endif | |||
| ifdef TRSMCOPYLT_M | |||
| $(KDIR)dtrsm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLT_M) | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ | |||
| $(KDIR)dtrsm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TRSMCOPYLT_M) | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ | |||
| else | |||
| $(KDIR)dtrsm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(DGEMM_UNROLL_M).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ | |||
| $(KDIR)dtrsm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(DGEMM_UNROLL_M).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ | |||
| endif | |||
| $(KDIR)dtrsm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(DGEMM_UNROLL_N).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ | |||
| @@ -2431,29 +2603,61 @@ $(KDIR)qtrsm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(QGEMM_UNROLL_N | |||
| $(KDIR)qtrsm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/trsm_ltcopy_$(QGEMM_UNROLL_N).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -UCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ | |||
| ifdef ZTRSMCOPYUN_M | |||
| $(KDIR)ctrsm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUN_M) | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ | |||
| $(KDIR)ctrsm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUN_M) | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ | |||
| else | |||
| $(KDIR)ctrsm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(CGEMM_UNROLL_M).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ | |||
| $(KDIR)ctrsm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(CGEMM_UNROLL_M).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ | |||
| endif | |||
| ifdef ZTRSMCOPYLN_M | |||
| $(KDIR)ctrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLN_M) | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ | |||
| $(KDIR)ctrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLN_M) | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ | |||
| else | |||
| $(KDIR)ctrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(CGEMM_UNROLL_M).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ | |||
| $(KDIR)ctrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(CGEMM_UNROLL_M).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ | |||
| endif | |||
| ifdef ZTRSMCOPYUT_M | |||
| $(KDIR)ctrsm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUT_M) | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ | |||
| $(KDIR)ctrsm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUT_M) | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ | |||
| else | |||
| $(KDIR)ctrsm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(CGEMM_UNROLL_M).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ | |||
| $(KDIR)ctrsm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(CGEMM_UNROLL_M).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ | |||
| endif | |||
| ifdef ZTRSMCOPYLT_M | |||
| $(KDIR)ctrsm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLT_M) | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ | |||
| $(KDIR)ctrsm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLT_M) | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ | |||
| else | |||
| $(KDIR)ctrsm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(CGEMM_UNROLL_M).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ | |||
| $(KDIR)ctrsm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(CGEMM_UNROLL_M).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ | |||
| endif | |||
| $(KDIR)ctrsm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(CGEMM_UNROLL_N).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ | |||
| @@ -2479,29 +2683,61 @@ $(KDIR)ctrsm_oltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(CGEMM_UNROLL_ | |||
| $(KDIR)ctrsm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(CGEMM_UNROLL_N).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -DLOWER -UUNIT $< -o $@ | |||
| ifdef ZTRSMCOPYUN_M | |||
| $(KDIR)ztrsm_iunucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUN_M) | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ | |||
| $(KDIR)ztrsm_iunncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUN_M) | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ | |||
| else | |||
| $(KDIR)ztrsm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(ZGEMM_UNROLL_M).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ | |||
| $(KDIR)ztrsm_iunncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(ZGEMM_UNROLL_M).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ | |||
| endif | |||
| ifdef ZTRSMCOPYLN_M | |||
| $(KDIR)ztrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLN_M) | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ | |||
| $(KDIR)ztrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLN_M) | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ | |||
| else | |||
| $(KDIR)ztrsm_ilnucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(ZGEMM_UNROLL_M).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ | |||
| $(KDIR)ztrsm_ilnncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_lncopy_$(ZGEMM_UNROLL_M).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ | |||
| endif | |||
| ifdef ZTRSMCOPYUT_M | |||
| $(KDIR)ztrsm_iutucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUT_M) | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ | |||
| $(KDIR)ztrsm_iutncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYUT_M) | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ | |||
| else | |||
| $(KDIR)ztrsm_iutucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(ZGEMM_UNROLL_M).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -DUNIT $< -o $@ | |||
| $(KDIR)ztrsm_iutncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_utcopy_$(ZGEMM_UNROLL_M).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ | |||
| endif | |||
| ifdef ZTRSMCOPYLT_M | |||
| $(KDIR)ztrsm_iltucopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLT_M) | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ | |||
| $(KDIR)ztrsm_iltncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRSMCOPYLT_M) | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ | |||
| else | |||
| $(KDIR)ztrsm_iltucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(ZGEMM_UNROLL_M).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -DUNIT $< -o $@ | |||
| $(KDIR)ztrsm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(ZGEMM_UNROLL_M).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ | |||
| endif | |||
| $(KDIR)ztrsm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_uncopy_$(ZGEMM_UNROLL_N).c | |||
| $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ | |||
| @@ -20,25 +20,36 @@ IDMAXKERNEL = ../arm/imax.c | |||
| ISMINKERNEL = ../arm/imin.c | |||
| IDMINKERNEL = ../arm/imin.c | |||
| STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| STRSMKERNEL_LN = trsm_kernel_LN_sve.c | |||
| STRSMKERNEL_LT = trsm_kernel_LT_sve.c | |||
| STRSMKERNEL_RN = trsm_kernel_RN_sve.c | |||
| STRSMKERNEL_RT = trsm_kernel_RT_sve.c | |||
| DTRSMKERNEL_LN = trsm_kernel_LN_sve.c | |||
| DTRSMKERNEL_LT = trsm_kernel_LT_sve.c | |||
| DTRSMKERNEL_RN = trsm_kernel_RN_sve.c | |||
| DTRSMKERNEL_RT = trsm_kernel_RT_sve.c | |||
| TRSMCOPYLN_M = trsm_lncopy_sve.c | |||
| TRSMCOPYLT_M = trsm_ltcopy_sve.c | |||
| TRSMCOPYUN_M = trsm_uncopy_sve.c | |||
| TRSMCOPYUT_M = trsm_utcopy_sve.c | |||
| CTRSMKERNEL_LN = trsm_kernel_LN_sve.c | |||
| CTRSMKERNEL_LT = trsm_kernel_LT_sve.c | |||
| CTRSMKERNEL_RN = trsm_kernel_RN_sve.c | |||
| CTRSMKERNEL_RT = trsm_kernel_RT_sve.c | |||
| ZTRSMKERNEL_LN = trsm_kernel_LN_sve.c | |||
| ZTRSMKERNEL_LT = trsm_kernel_LT_sve.c | |||
| ZTRSMKERNEL_RN = trsm_kernel_RN_sve.c | |||
| ZTRSMKERNEL_RT = trsm_kernel_RT_sve.c | |||
| ZTRSMCOPYLN_M = ztrsm_lncopy_sve.c | |||
| ZTRSMCOPYLT_M = ztrsm_ltcopy_sve.c | |||
| ZTRSMCOPYUN_M = ztrsm_uncopy_sve.c | |||
| ZTRSMCOPYUT_M = ztrsm_utcopy_sve.c | |||
| SAMAXKERNEL = amax.S | |||
| DAMAXKERNEL = amax.S | |||
| @@ -156,28 +167,50 @@ DTRMMLTCOPY_M = trmm_ltcopy_sve_v1.c | |||
| DSYMMUCOPY_M = symm_ucopy_sve.c | |||
| DSYMMLCOPY_M = symm_lcopy_sve.c | |||
| CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S | |||
| CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S | |||
| ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) | |||
| CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c | |||
| CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c | |||
| CGEMMKERNEL = cgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S | |||
| CTRMMKERNEL = ctrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S | |||
| CGEMMINCOPY = cgemm_ncopy_sve_v1.c | |||
| CGEMMITCOPY = cgemm_tcopy_sve_v1.c | |||
| CGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c | |||
| CGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c | |||
| CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| endif | |||
| CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c | |||
| CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c | |||
| CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S | |||
| ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S | |||
| ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) | |||
| ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c | |||
| ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c | |||
| ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| endif | |||
| CTRMMUNCOPY_M = ztrmm_uncopy_sve_v1.c | |||
| CTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c | |||
| CTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c | |||
| CTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c | |||
| CHEMMLTCOPY_M = zhemm_ltcopy_sve.c | |||
| CHEMMUTCOPY_M = zhemm_utcopy_sve.c | |||
| CSYMMUCOPY_M = zsymm_ucopy_sve.c | |||
| CSYMMLCOPY_M = zsymm_lcopy_sve.c | |||
| ZGEMMKERNEL = zgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S | |||
| ZTRMMKERNEL = ztrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S | |||
| ZGEMMINCOPY = zgemm_ncopy_sve_v1.c | |||
| ZGEMMITCOPY = zgemm_tcopy_sve_v1.c | |||
| ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c | |||
| ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c | |||
| ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| ZTRMMUNCOPY_M = ztrmm_uncopy_sve_v1.c | |||
| ZTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c | |||
| ZTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c | |||
| ZTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c | |||
| ZHEMMLTCOPY_M = zhemm_ltcopy_sve.c | |||
| ZHEMMUTCOPY_M = zhemm_utcopy_sve.c | |||
| ZSYMMUCOPY_M = zsymm_ucopy_sve.c | |||
| ZSYMMLCOPY_M = zsymm_lcopy_sve.c | |||
| @@ -20,25 +20,36 @@ IDMAXKERNEL = ../arm/imax.c | |||
| ISMINKERNEL = ../arm/imin.c | |||
| IDMINKERNEL = ../arm/imin.c | |||
| STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| STRSMKERNEL_LN = trsm_kernel_LN_sve.c | |||
| STRSMKERNEL_LT = trsm_kernel_LT_sve.c | |||
| STRSMKERNEL_RN = trsm_kernel_RN_sve.c | |||
| STRSMKERNEL_RT = trsm_kernel_RT_sve.c | |||
| DTRSMKERNEL_LN = trsm_kernel_LN_sve.c | |||
| DTRSMKERNEL_LT = trsm_kernel_LT_sve.c | |||
| DTRSMKERNEL_RN = trsm_kernel_RN_sve.c | |||
| DTRSMKERNEL_RT = trsm_kernel_RT_sve.c | |||
| TRSMCOPYLN_M = trsm_lncopy_sve.c | |||
| TRSMCOPYLT_M = trsm_ltcopy_sve.c | |||
| TRSMCOPYUN_M = trsm_uncopy_sve.c | |||
| TRSMCOPYUT_M = trsm_utcopy_sve.c | |||
| CTRSMKERNEL_LN = trsm_kernel_LN_sve.c | |||
| CTRSMKERNEL_LT = trsm_kernel_LT_sve.c | |||
| CTRSMKERNEL_RN = trsm_kernel_RN_sve.c | |||
| CTRSMKERNEL_RT = trsm_kernel_RT_sve.c | |||
| ZTRSMKERNEL_LN = trsm_kernel_LN_sve.c | |||
| ZTRSMKERNEL_LT = trsm_kernel_LT_sve.c | |||
| ZTRSMKERNEL_RN = trsm_kernel_RN_sve.c | |||
| ZTRSMKERNEL_RT = trsm_kernel_RT_sve.c | |||
| ZTRSMCOPYLN_M = ztrsm_lncopy_sve.c | |||
| ZTRSMCOPYLT_M = ztrsm_ltcopy_sve.c | |||
| ZTRSMCOPYUN_M = ztrsm_uncopy_sve.c | |||
| ZTRSMCOPYUT_M = ztrsm_utcopy_sve.c | |||
| SAMAXKERNEL = amax.S | |||
| DAMAXKERNEL = amax.S | |||
| @@ -140,8 +151,8 @@ DTRMMKERNEL = dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S | |||
| DGEMMINCOPY = dgemm_ncopy_sve_v1.c | |||
| DGEMMITCOPY = dgemm_tcopy_sve_v1.c | |||
| DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c | |||
| DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c | |||
| DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S | |||
| DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S | |||
| DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| @@ -156,28 +167,50 @@ DTRMMLTCOPY_M = trmm_ltcopy_sve_v1.c | |||
| DSYMMUCOPY_M = symm_ucopy_sve.c | |||
| DSYMMLCOPY_M = symm_lcopy_sve.c | |||
| CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S | |||
| CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S | |||
| ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) | |||
| CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c | |||
| CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c | |||
| CGEMMKERNEL = cgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S | |||
| CTRMMKERNEL = ctrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S | |||
| CGEMMINCOPY = cgemm_ncopy_sve_v1.c | |||
| CGEMMITCOPY = cgemm_tcopy_sve_v1.c | |||
| CGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c | |||
| CGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c | |||
| CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| endif | |||
| CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c | |||
| CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c | |||
| CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S | |||
| ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S | |||
| ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) | |||
| ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c | |||
| ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c | |||
| ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| endif | |||
| CTRMMUNCOPY_M = ztrmm_uncopy_sve_v1.c | |||
| CTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c | |||
| CTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c | |||
| CTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c | |||
| CHEMMLTCOPY_M = zhemm_ltcopy_sve.c | |||
| CHEMMUTCOPY_M = zhemm_utcopy_sve.c | |||
| CSYMMUCOPY_M = zsymm_ucopy_sve.c | |||
| CSYMMLCOPY_M = zsymm_lcopy_sve.c | |||
| ZGEMMKERNEL = zgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S | |||
| ZTRMMKERNEL = ztrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S | |||
| ZGEMMINCOPY = zgemm_ncopy_sve_v1.c | |||
| ZGEMMITCOPY = zgemm_tcopy_sve_v1.c | |||
| ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c | |||
| ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c | |||
| ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| ZTRMMUNCOPY_M = ztrmm_uncopy_sve_v1.c | |||
| ZTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c | |||
| ZTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c | |||
| ZTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c | |||
| ZHEMMLTCOPY_M = zhemm_ltcopy_sve.c | |||
| ZHEMMUTCOPY_M = zhemm_utcopy_sve.c | |||
| ZSYMMUCOPY_M = zsymm_ucopy_sve.c | |||
| ZSYMMLCOPY_M = zsymm_lcopy_sve.c | |||
| @@ -0,0 +1,189 @@ | |||
| # Min/max, abs-min and index-of-min/max kernels: portable C sources from ../arm. | |||
| SAMINKERNEL = ../arm/amin.c | |||
| DAMINKERNEL = ../arm/amin.c | |||
| CAMINKERNEL = ../arm/zamin.c | |||
| ZAMINKERNEL = ../arm/zamin.c | |||
| SMAXKERNEL = ../arm/max.c | |||
| DMAXKERNEL = ../arm/max.c | |||
| SMINKERNEL = ../arm/min.c | |||
| DMINKERNEL = ../arm/min.c | |||
| ISAMINKERNEL = ../arm/iamin.c | |||
| IDAMINKERNEL = ../arm/iamin.c | |||
| ICAMINKERNEL = ../arm/izamin.c | |||
| IZAMINKERNEL = ../arm/izamin.c | |||
| ISMAXKERNEL = ../arm/imax.c | |||
| IDMAXKERNEL = ../arm/imax.c | |||
| ISMINKERNEL = ../arm/imin.c | |||
| IDMINKERNEL = ../arm/imin.c | |||
| # TRSM kernels: every precision (S/D/C/Z) and every variant (LN/LT/RN/RT) points at the same generic C source per variant. | |||
| STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| # AMAX, AXPY, ROT, SCAL and GEMV kernels: assembly sources from this directory. | |||
| # S and D share one source per routine, C and Z share the z-prefixed one; DAXPY alone uses a thunderx2t99 variant. | |||
| SAMAXKERNEL = amax.S | |||
| DAMAXKERNEL = amax.S | |||
| CAMAXKERNEL = zamax.S | |||
| ZAMAXKERNEL = zamax.S | |||
| SAXPYKERNEL = axpy.S | |||
| DAXPYKERNEL = daxpy_thunderx2t99.S | |||
| CAXPYKERNEL = zaxpy.S | |||
| ZAXPYKERNEL = zaxpy.S | |||
| SROTKERNEL = rot.S | |||
| DROTKERNEL = rot.S | |||
| CROTKERNEL = zrot.S | |||
| ZROTKERNEL = zrot.S | |||
| SSCALKERNEL = scal.S | |||
| DSCALKERNEL = scal.S | |||
| CSCALKERNEL = zscal.S | |||
| ZSCALKERNEL = zscal.S | |||
| SGEMVNKERNEL = gemv_n.S | |||
| DGEMVNKERNEL = gemv_n.S | |||
| CGEMVNKERNEL = zgemv_n.S | |||
| ZGEMVNKERNEL = zgemv_n.S | |||
| SGEMVTKERNEL = gemv_t.S | |||
| DGEMVTKERNEL = gemv_t.S | |||
| CGEMVTKERNEL = zgemv_t.S | |||
| ZGEMVTKERNEL = zgemv_t.S | |||
| # ASUM, COPY, SWAP, IAMAX, NRM2 and DOT kernels: all reuse the *_thunderx2t99 sources. | |||
| SASUMKERNEL = sasum_thunderx2t99.c | |||
| DASUMKERNEL = dasum_thunderx2t99.c | |||
| CASUMKERNEL = casum_thunderx2t99.c | |||
| ZASUMKERNEL = zasum_thunderx2t99.c | |||
| SCOPYKERNEL = copy_thunderx2t99.c | |||
| DCOPYKERNEL = copy_thunderx2t99.c | |||
| CCOPYKERNEL = copy_thunderx2t99.c | |||
| ZCOPYKERNEL = copy_thunderx2t99.c | |||
| SSWAPKERNEL = swap_thunderx2t99.S | |||
| DSWAPKERNEL = swap_thunderx2t99.S | |||
| CSWAPKERNEL = swap_thunderx2t99.S | |||
| ZSWAPKERNEL = swap_thunderx2t99.S | |||
| ISAMAXKERNEL = iamax_thunderx2t99.c | |||
| IDAMAXKERNEL = iamax_thunderx2t99.c | |||
| ICAMAXKERNEL = izamax_thunderx2t99.c | |||
| IZAMAXKERNEL = izamax_thunderx2t99.c | |||
| # NRM2: the single-precision real and complex kernels share scnrm2_*, the double-precision ones share dznrm2_*. | |||
| SNRM2KERNEL = scnrm2_thunderx2t99.c | |||
| DNRM2KERNEL = dznrm2_thunderx2t99.c | |||
| CNRM2KERNEL = scnrm2_thunderx2t99.c | |||
| ZNRM2KERNEL = dznrm2_thunderx2t99.c | |||
| DDOTKERNEL = dot_thunderx2t99.c | |||
| SDOTKERNEL = dot_thunderx2t99.c | |||
| CDOTKERNEL = zdot_thunderx2t99.c | |||
| ZDOTKERNEL = zdot_thunderx2t99.c | |||
| # DSDOT keeps the plain assembly source rather than the thunderx2t99 one. | |||
| DSDOTKERNEL = dot.S | |||
| # GEMM beta-scaling sources; only the S and D variants are assigned in this fragment. | |||
| DGEMM_BETA = dgemm_beta.S | |||
| SGEMM_BETA = sgemm_beta.S | |||
| # SGEMM/STRMM compute kernels: source name is built from the M and N unroll factors. | |||
| SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S | |||
| STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S | |||
| # The I-side (UNROLL_M) copy routines are only assigned when the two unroll factors differ. | |||
| # NOTE(review): presumably the O-side copies are reused when they are equal - confirm in the including Makefile. | |||
| ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) | |||
| # Assembly tcopy is selected only for unroll 16; any other size falls back to the generic C source. | |||
| ifeq ($(SGEMM_UNROLL_M), 16) | |||
| SGEMMITCOPY = sgemm_tcopy_$(SGEMM_UNROLL_M).S | |||
| else | |||
| SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c | |||
| endif | |||
| # Assembly ncopy is selected only for unroll 4; any other size falls back to the generic C source. | |||
| ifeq ($(SGEMM_UNROLL_M), 4) | |||
| SGEMMINCOPY = sgemm_ncopy_$(SGEMM_UNROLL_M).S | |||
| else | |||
| SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c | |||
| endif | |||
| SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| endif | |||
| # O-side (UNROLL_N) copy routines are always assigned, using the same 16/4 assembly-or-generic rule. | |||
| ifeq ($(SGEMM_UNROLL_N), 16) | |||
| SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S | |||
| else | |||
| SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c | |||
| endif | |||
| ifeq ($(SGEMM_UNROLL_N), 4) | |||
| SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S | |||
| else | |||
| SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c | |||
| endif | |||
| SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| # DGEMM/DTRMM compute kernels: source name is built from the M and N unroll factors. | |||
| DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S | |||
| DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S | |||
| # The I-side (UNROLL_M) copy routines are only assigned when the two unroll factors differ. | |||
| ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) | |||
| # Assembly copies are selected only for an M unroll of 8; other sizes use the generic C sources. | |||
| ifeq ($(DGEMM_UNROLL_M), 8) | |||
| DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S | |||
| DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S | |||
| else | |||
| DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c | |||
| DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c | |||
| endif | |||
| DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| endif | |||
| # O-side (UNROLL_N) copies: assembly only for an N unroll of 4, generic C otherwise. | |||
| ifeq ($(DGEMM_UNROLL_N), 4) | |||
| DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S | |||
| DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S | |||
| else | |||
| DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c | |||
| DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c | |||
| endif | |||
| DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| # CGEMM/CTRMM compute kernels: source name is built from the M and N unroll factors. | |||
| CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S | |||
| CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S | |||
| # Complex copy routines always come from the generic zgemm_* C sources (shared by C and Z). | |||
| # The I-side copies are only assigned when the M and N unroll factors differ. | |||
| ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) | |||
| CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c | |||
| CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c | |||
| CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| endif | |||
| CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c | |||
| CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c | |||
| CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| # ZGEMM/ZTRMM: same layout as the CGEMM section above, keyed on the ZGEMM unroll factors. | |||
| ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S | |||
| ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S | |||
| ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) | |||
| ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c | |||
| ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c | |||
| ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| endif | |||
| ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c | |||
| ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c | |||
| ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| @@ -0,0 +1,189 @@ | |||
| # Min/max, abs-min and index-of-min/max kernels: portable C sources from ../arm. | |||
| SAMINKERNEL = ../arm/amin.c | |||
| DAMINKERNEL = ../arm/amin.c | |||
| CAMINKERNEL = ../arm/zamin.c | |||
| ZAMINKERNEL = ../arm/zamin.c | |||
| SMAXKERNEL = ../arm/max.c | |||
| DMAXKERNEL = ../arm/max.c | |||
| SMINKERNEL = ../arm/min.c | |||
| DMINKERNEL = ../arm/min.c | |||
| ISAMINKERNEL = ../arm/iamin.c | |||
| IDAMINKERNEL = ../arm/iamin.c | |||
| ICAMINKERNEL = ../arm/izamin.c | |||
| IZAMINKERNEL = ../arm/izamin.c | |||
| ISMAXKERNEL = ../arm/imax.c | |||
| IDMAXKERNEL = ../arm/imax.c | |||
| ISMINKERNEL = ../arm/imin.c | |||
| IDMINKERNEL = ../arm/imin.c | |||
| # TRSM kernels: every precision (S/D/C/Z) and every variant (LN/LT/RN/RT) points at the same generic C source per variant. | |||
| STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| # AMAX, AXPY, ROT, SCAL and GEMV kernels: assembly sources from this directory. | |||
| # S and D share one source per routine, C and Z share the z-prefixed one; DAXPY alone uses a thunderx2t99 variant. | |||
| SAMAXKERNEL = amax.S | |||
| DAMAXKERNEL = amax.S | |||
| CAMAXKERNEL = zamax.S | |||
| ZAMAXKERNEL = zamax.S | |||
| SAXPYKERNEL = axpy.S | |||
| DAXPYKERNEL = daxpy_thunderx2t99.S | |||
| CAXPYKERNEL = zaxpy.S | |||
| ZAXPYKERNEL = zaxpy.S | |||
| SROTKERNEL = rot.S | |||
| DROTKERNEL = rot.S | |||
| CROTKERNEL = zrot.S | |||
| ZROTKERNEL = zrot.S | |||
| SSCALKERNEL = scal.S | |||
| DSCALKERNEL = scal.S | |||
| CSCALKERNEL = zscal.S | |||
| ZSCALKERNEL = zscal.S | |||
| SGEMVNKERNEL = gemv_n.S | |||
| DGEMVNKERNEL = gemv_n.S | |||
| CGEMVNKERNEL = zgemv_n.S | |||
| ZGEMVNKERNEL = zgemv_n.S | |||
| SGEMVTKERNEL = gemv_t.S | |||
| DGEMVTKERNEL = gemv_t.S | |||
| CGEMVTKERNEL = zgemv_t.S | |||
| ZGEMVTKERNEL = zgemv_t.S | |||
| # ASUM, COPY, SWAP, IAMAX, NRM2 and DOT kernels: all reuse the *_thunderx2t99 sources. | |||
| SASUMKERNEL = sasum_thunderx2t99.c | |||
| DASUMKERNEL = dasum_thunderx2t99.c | |||
| CASUMKERNEL = casum_thunderx2t99.c | |||
| ZASUMKERNEL = zasum_thunderx2t99.c | |||
| SCOPYKERNEL = copy_thunderx2t99.c | |||
| DCOPYKERNEL = copy_thunderx2t99.c | |||
| CCOPYKERNEL = copy_thunderx2t99.c | |||
| ZCOPYKERNEL = copy_thunderx2t99.c | |||
| SSWAPKERNEL = swap_thunderx2t99.S | |||
| DSWAPKERNEL = swap_thunderx2t99.S | |||
| CSWAPKERNEL = swap_thunderx2t99.S | |||
| ZSWAPKERNEL = swap_thunderx2t99.S | |||
| ISAMAXKERNEL = iamax_thunderx2t99.c | |||
| IDAMAXKERNEL = iamax_thunderx2t99.c | |||
| ICAMAXKERNEL = izamax_thunderx2t99.c | |||
| IZAMAXKERNEL = izamax_thunderx2t99.c | |||
| # NRM2: the single-precision real and complex kernels share scnrm2_*, the double-precision ones share dznrm2_*. | |||
| SNRM2KERNEL = scnrm2_thunderx2t99.c | |||
| DNRM2KERNEL = dznrm2_thunderx2t99.c | |||
| CNRM2KERNEL = scnrm2_thunderx2t99.c | |||
| ZNRM2KERNEL = dznrm2_thunderx2t99.c | |||
| DDOTKERNEL = dot_thunderx2t99.c | |||
| SDOTKERNEL = dot_thunderx2t99.c | |||
| CDOTKERNEL = zdot_thunderx2t99.c | |||
| ZDOTKERNEL = zdot_thunderx2t99.c | |||
| # DSDOT keeps the plain assembly source rather than the thunderx2t99 one. | |||
| DSDOTKERNEL = dot.S | |||
| # GEMM beta-scaling sources; only the S and D variants are assigned in this fragment. | |||
| DGEMM_BETA = dgemm_beta.S | |||
| SGEMM_BETA = sgemm_beta.S | |||
| # SGEMM/STRMM compute kernels: source name is built from the M and N unroll factors. | |||
| SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S | |||
| STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S | |||
| # The I-side (UNROLL_M) copy routines are only assigned when the two unroll factors differ. | |||
| # NOTE(review): presumably the O-side copies are reused when they are equal - confirm in the including Makefile. | |||
| ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) | |||
| # Assembly tcopy is selected only for unroll 16; any other size falls back to the generic C source. | |||
| ifeq ($(SGEMM_UNROLL_M), 16) | |||
| SGEMMITCOPY = sgemm_tcopy_$(SGEMM_UNROLL_M).S | |||
| else | |||
| SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c | |||
| endif | |||
| # Assembly ncopy is selected only for unroll 4; any other size falls back to the generic C source. | |||
| ifeq ($(SGEMM_UNROLL_M), 4) | |||
| SGEMMINCOPY = sgemm_ncopy_$(SGEMM_UNROLL_M).S | |||
| else | |||
| SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c | |||
| endif | |||
| SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| endif | |||
| # O-side (UNROLL_N) copy routines are always assigned, using the same 16/4 assembly-or-generic rule. | |||
| ifeq ($(SGEMM_UNROLL_N), 16) | |||
| SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S | |||
| else | |||
| SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c | |||
| endif | |||
| ifeq ($(SGEMM_UNROLL_N), 4) | |||
| SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S | |||
| else | |||
| SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c | |||
| endif | |||
| SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| # DGEMM/DTRMM compute kernels: source name is built from the M and N unroll factors. | |||
| DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S | |||
| DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S | |||
| # The I-side (UNROLL_M) copy routines are only assigned when the two unroll factors differ. | |||
| ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) | |||
| # Assembly copies are selected only for an M unroll of 8; other sizes use the generic C sources. | |||
| ifeq ($(DGEMM_UNROLL_M), 8) | |||
| DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S | |||
| DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S | |||
| else | |||
| DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c | |||
| DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c | |||
| endif | |||
| DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| endif | |||
| # O-side (UNROLL_N) copies: assembly only for an N unroll of 4, generic C otherwise. | |||
| ifeq ($(DGEMM_UNROLL_N), 4) | |||
| DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S | |||
| DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S | |||
| else | |||
| DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c | |||
| DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c | |||
| endif | |||
| DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| # CGEMM/CTRMM compute kernels: source name is built from the M and N unroll factors. | |||
| CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S | |||
| CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S | |||
| # Complex copy routines always come from the generic zgemm_* C sources (shared by C and Z). | |||
| # The I-side copies are only assigned when the M and N unroll factors differ. | |||
| ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) | |||
| CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c | |||
| CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c | |||
| CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| endif | |||
| CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c | |||
| CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c | |||
| CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| # ZGEMM/ZTRMM: same layout as the CGEMM section above, keyed on the ZGEMM unroll factors. | |||
| ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S | |||
| ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S | |||
| ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) | |||
| ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c | |||
| ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c | |||
| ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| endif | |||
| ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c | |||
| ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c | |||
| ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| @@ -0,0 +1,874 @@ | |||
| /******************************************************************************* | |||
| Copyright (c) 2015, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *******************************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| /* x0 x1 x2 s0 s1 x3 x4 x5 x6 */ | |||
| /*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT alpha1,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc) */ | |||
| // Kernel arguments as they arrive in integer registers x0-x6. | |||
| #define origM x0 | |||
| #define origN x1 | |||
| #define origK x2 | |||
| #define origPA x3 | |||
| #define origPB x4 | |||
| #define pC x5 | |||
| #define LDC x6 | |||
| // Temporaries, loop counters and working pointers into A, B and the C rows. | |||
| #define temp x7 | |||
| #define counterL x8 | |||
| #define counterI x9 | |||
| #define counterJ x10 | |||
| #define pB x11 | |||
| #define pCRow0 x12 | |||
| #define pCRow1 x13 | |||
| #define pCRow2 x14 | |||
| #define pCRow3 x15 | |||
| #define pA x16 | |||
| // Number of active lanes in the current M predicate (written by cntp). | |||
| #define lanes x17 | |||
| // General-purpose staging registers used to move alpha from s0/s1 into the broadcast vectors below. | |||
| #define alphaR w19 | |||
| #define alphaI w20 | |||
| // alpha real and imaginary parts replicated across every 32-bit lane. | |||
| #define alphaz_R z6.s | |||
| #define alphaz_I z7.s | |||
| // NOTE(review): alpha0_R/alpha0_I are not referenced in the visible part of this file - confirm they are still needed. | |||
| #define alpha0_R s4 | |||
| #define alpha0_I s5 | |||
| // Byte offsets added to pA, pB and the C row pointers in the prfm prefetch instructions. | |||
| #define A_PRE_SIZE 2560 | |||
| #define B_PRE_SIZE 448 | |||
| #define C_PRE_SIZE 128 | |||
| // Select the accumulate instruction (fmla adds, fmls subtracts) for each of the four | |||
| // partial products of a complex multiply, keyed on the build-variant macro. | |||
| // As used by the KERNEL macros: the first letter names the A operand and the second the B operand | |||
| // (r = first vector/scalar of a loaded pair, i = second; presumably real and imaginary parts). | |||
| // OP_rr and OP_ii update the first accumulator of a pair, OP_ir and OP_ri the second. | |||
| // NOTE(review): the variant names presumably encode transpose/conjugate combinations - confirm. | |||
| #if defined(NN) || defined(NT) || defined(TN) || defined(TT) | |||
| #define OP_rr fmla | |||
| #define OP_ii fmls | |||
| #define OP_ri fmla | |||
| #define OP_ir fmla | |||
| #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) | |||
| #define OP_rr fmla | |||
| #define OP_ii fmla | |||
| #define OP_ri fmls | |||
| #define OP_ir fmla | |||
| #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) | |||
| #define OP_rr fmla | |||
| #define OP_ii fmla | |||
| #define OP_ri fmla | |||
| #define OP_ir fmls | |||
| #elif defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
| #define OP_rr fmla | |||
| #define OP_ii fmls | |||
| #define OP_ri fmls | |||
| #define OP_ir fmls | |||
| #endif | |||
| // 00 origM | |||
| // 01 origN | |||
| // 02 origK | |||
| // 03 origPA | |||
| // 04 origPB | |||
| // 05 pC | |||
| // 06 origLDC -> LDC | |||
| // 07 offset -> temp | |||
| // 08 counterL | |||
| // 09 counterI | |||
| // 10 counterJ | |||
| // 11 pB | |||
| // 12 pCRow0 | |||
| // 13 pCRow1 | |||
| // 14 pCRow2 | |||
| // 15 pCRow3 | |||
| // 16 pA | |||
| // 17 lanes | |||
| // 18 must save | |||
| // 19 must save alphaR (w19) | |||
| // 20 must save alphaI (w20) | |||
| // 21 must save | |||
| // 22 must save | |||
| // 23 must save | |||
| // 24 must save | |||
| // 25 must save | |||
| // 26 must save | |||
| // 27 must save | |||
| // 28 must save | |||
| // 29 frame | |||
| // 30 link | |||
| // 31 sp | |||
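| // SVE vector register usage in the macros below (R/I = first/second vector of a de-interleaved pair): | |||
| //z00 A, R part, current K step | |||
| //z01 A, I part, current K step | |||
| //z02 A, R part, next K step (pipelined v1x4 macros only) | |||
| //z03 A, I part, next K step (pipelined v1x4 macros only) | |||
| //z04 not used by the macros below | |||
| //z05 not used by the macros below | |||
| //z06 alphaz_R (alpha real part, broadcast) | |||
| //z07 alphaz_I (alpha imaginary part, broadcast) | |||
| //z08 must save (d8) B value 0, R part, broadcast from [pB] | |||
| //z09 must save (d9) B value 0, I part, broadcast from [pB, 4] | |||
| //z10 must save (d10) B value 1, R part, broadcast from [pB, 8] | |||
| //z11 must save (d11) B value 1, I part, broadcast from [pB, 12] | |||
| //z12 must save (d12) B value 2, R part, broadcast from [pB, 16] | |||
| //z13 must save (d13) B value 2, I part, broadcast from [pB, 20] | |||
| //z14 must save (d14) B value 3, R part, broadcast from [pB, 24] | |||
| //z15 must save (d15) B value 3, I part, broadcast from [pB, 28] | |||
| //z16 accumulator for pCRow0, R part | |||
| //z17 accumulator for pCRow0, I part | |||
| //z18 accumulator for pCRow1, R part | |||
| //z19 accumulator for pCRow1, I part | |||
| //z20 accumulator for pCRow2, R part | |||
| //z21 accumulator for pCRow2, I part | |||
| //z22 accumulator for pCRow3, R part | |||
| //z23 accumulator for pCRow3, I part | |||
| //z24 C values at pCRow0, R part (SAVE macros) | |||
| //z25 C values at pCRow0, I part (SAVE macros) | |||
| //z26 C values at pCRow1, R part (SAVE macros) | |||
| //z27 C values at pCRow1, I part (SAVE macros) | |||
| //z28 C values at pCRow2, R part (SAVE macros) | |||
| //z29 C values at pCRow2, I part (SAVE macros) | |||
| //z30 C values at pCRow3, R part (SAVE macros) | |||
| //z31 C values at pCRow3, I part (SAVE macros) | |||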
| /******************************************************************************* | |||
| * Macro definitions | |||
| *******************************************************************************/ | |||
| // Zero the eight accumulators z16-z23 used by the v1x4 kernel macros. | |||
| // "mov zN.s, #0" is the architectural alias of "dup zN.s, #0" and assembles to the same instruction. | |||
| .macro INITv1x4 | |||
| mov z16.s, #0 | |||
| mov z17.s, #0 | |||
| mov z18.s, #0 | |||
| mov z19.s, #0 | |||
| mov z20.s, #0 | |||
| mov z21.s, #0 | |||
| mov z22.s, #0 | |||
| mov z23.s, #0 | |||
| .endm | |||
| // First K step of the software-pipelined v1x4 loop. | |||
| // Loads two consecutive A vector pairs (z0/z1 for this step, z2/z3 for the next) and the eight | |||
| // B scalars for this step, accumulates this step into z16-z23, and while doing so reloads | |||
| // z8-z15 with the B scalars of the next step. pA advances by two steps, pB by two steps (2 x 32 bytes). | |||
| // The sign of each partial product comes from the OP_rr/OP_ir/OP_ri/OP_ii macros, exactly as in | |||
| // KERNELv1x4_M1/M2/E/SUB; the instruction order is kept so the accumulation order is unchanged. | |||
| .macro KERNELv1x4_I | |||
| ld2w {z0.s, z1.s}, p1/z, [pA] // A for this step | |||
| add pA, pA, lanes, lsl #3 // pA += lanes*2*4 | |||
| ld2w {z2.s, z3.s}, p1/z, [pA] // A for the next step | |||
| add pA, pA, lanes, lsl #3 // pA += lanes*2*4 | |||
| ld1rw z8.s, p0/z, [pB] | |||
| ld1rw z9.s, p0/z, [pB, 4] | |||
| ld1rw z10.s, p0/z, [pB, 8] | |||
| ld1rw z11.s, p0/z, [pB, 12] | |||
| ld1rw z12.s, p0/z, [pB, 16] | |||
| ld1rw z13.s, p0/z, [pB, 20] | |||
| ld1rw z14.s, p0/z, [pB, 24] | |||
| ld1rw z15.s, p0/z, [pB, 28] | |||
| add pB, pB, 32 | |||
| // Each B register is reloaded for the next step right after its last use in this step. | |||
| OP_rr z16.s, p1/m, z0.s, z8.s | |||
| OP_ir z17.s, p1/m, z1.s, z8.s | |||
| ld1rw z8.s, p0/z, [pB] | |||
| OP_ri z17.s, p1/m, z0.s, z9.s | |||
| OP_ii z16.s, p1/m, z1.s, z9.s | |||
| ld1rw z9.s, p0/z, [pB, 4] | |||
| OP_rr z18.s, p1/m, z0.s, z10.s | |||
| OP_ir z19.s, p1/m, z1.s, z10.s | |||
| ld1rw z10.s, p0/z, [pB, 8] | |||
| OP_ii z18.s, p1/m, z1.s, z11.s | |||
| OP_ri z19.s, p1/m, z0.s, z11.s | |||
| ld1rw z11.s, p0/z, [pB, 12] | |||
| OP_rr z20.s, p1/m, z0.s, z12.s | |||
| OP_ir z21.s, p1/m, z1.s, z12.s | |||
| ld1rw z12.s, p0/z, [pB, 16] | |||
| OP_ri z21.s, p1/m, z0.s, z13.s | |||
| OP_ii z20.s, p1/m, z1.s, z13.s | |||
| ld1rw z13.s, p0/z, [pB, 20] | |||
| OP_rr z22.s, p1/m, z0.s, z14.s | |||
| OP_ir z23.s, p1/m, z1.s, z14.s | |||
| ld1rw z14.s, p0/z, [pB, 24] | |||
| OP_ri z23.s, p1/m, z0.s, z15.s | |||
| OP_ii z22.s, p1/m, z1.s, z15.s | |||
| ld1rw z15.s, p0/z, [pB, 28] | |||
| add pB, pB, 32 | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] | |||
| .endm | |||
| // Pipelined K step, phase 1: accumulates the A pair already in z0/z1 against the B scalars | |||
| // already in z8-z15, while loading the next A pair into z2/z3 and reloading z8-z15 from pB | |||
| // for the following step. Advances pA by one step and pB by 32 bytes, then prefetches A. | |||
| .macro KERNELv1x4_M1 | |||
| ld2w {z2.s, z3.s}, p1/z, [pA] | |||
| add pA, pA, lanes, lsl #3 // pA = pA + lanes * 2 * 4 | |||
| // Each B register is reloaded immediately after its last use in this step. | |||
| OP_rr z16.s, p1/m, z0.s, z8.s | |||
| OP_ir z17.s, p1/m, z1.s, z8.s | |||
| ld1rw z8.s, p0/z, [pB] | |||
| OP_ii z16.s, p1/m, z1.s, z9.s | |||
| OP_ri z17.s, p1/m, z0.s, z9.s | |||
| ld1rw z9.s, p0/z, [pB, 4] | |||
| OP_rr z18.s, p1/m, z0.s, z10.s | |||
| OP_ir z19.s, p1/m, z1.s, z10.s | |||
| ld1rw z10.s, p0/z, [pB, 8] | |||
| OP_ii z18.s, p1/m, z1.s, z11.s | |||
| OP_ri z19.s, p1/m, z0.s, z11.s | |||
| ld1rw z11.s, p0/z, [pB, 12] | |||
| OP_rr z20.s, p1/m, z0.s, z12.s | |||
| OP_ir z21.s, p1/m, z1.s, z12.s | |||
| ld1rw z12.s, p0/z, [pB, 16] | |||
| OP_ii z20.s, p1/m, z1.s, z13.s | |||
| OP_ri z21.s, p1/m, z0.s, z13.s | |||
| ld1rw z13.s, p0/z, [pB, 20] | |||
| OP_rr z22.s, p1/m, z0.s, z14.s | |||
| OP_ir z23.s, p1/m, z1.s, z14.s | |||
| ld1rw z14.s, p0/z, [pB, 24] | |||
| OP_ii z22.s, p1/m, z1.s, z15.s | |||
| OP_ri z23.s, p1/m, z0.s, z15.s | |||
| ld1rw z15.s, p0/z, [pB, 28] | |||
| add pB, pB, 32 | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] | |||
| .endm | |||
| // Pipelined K step, phase 2: mirror image of KERNELv1x4_M1. Accumulates the A pair in z2/z3 | |||
| // against z8-z15 while loading the next A pair into z0/z1 and reloading z8-z15 from pB. | |||
| // Advances pA by one step and pB by 32 bytes, and prefetches B instead of A. | |||
| .macro KERNELv1x4_M2 | |||
| ld2w {z0.s, z1.s}, p1/z, [pA] | |||
| add pA, pA, lanes, lsl #3 // pA = pA + lanes *2 * 4 | |||
| OP_rr z16.s, p1/m, z2.s, z8.s | |||
| OP_ir z17.s, p1/m, z3.s, z8.s | |||
| ld1rw z8.s, p0/z, [pB] | |||
| OP_ii z16.s, p1/m, z3.s, z9.s | |||
| OP_ri z17.s, p1/m, z2.s, z9.s | |||
| ld1rw z9.s, p0/z, [pB, 4] | |||
| OP_rr z18.s, p1/m, z2.s, z10.s | |||
| OP_ir z19.s, p1/m, z3.s, z10.s | |||
| ld1rw z10.s, p0/z, [pB, 8] | |||
| OP_ii z18.s, p1/m, z3.s, z11.s | |||
| OP_ri z19.s, p1/m, z2.s, z11.s | |||
| ld1rw z11.s, p0/z, [pB, 12] | |||
| OP_rr z20.s, p1/m, z2.s, z12.s | |||
| OP_ir z21.s, p1/m, z3.s, z12.s | |||
| ld1rw z12.s, p0/z, [pB, 16] | |||
| OP_ii z20.s, p1/m, z3.s, z13.s | |||
| OP_ri z21.s, p1/m, z2.s, z13.s | |||
| ld1rw z13.s, p0/z, [pB, 20] | |||
| OP_rr z22.s, p1/m, z2.s, z14.s | |||
| OP_ir z23.s, p1/m, z3.s, z14.s | |||
| ld1rw z14.s, p0/z, [pB, 24] | |||
| OP_ii z22.s, p1/m, z3.s, z15.s | |||
| OP_ri z23.s, p1/m, z2.s, z15.s | |||
| ld1rw z15.s, p0/z, [pB, 28] | |||
| // The first prefetch uses pB before it is advanced, the second after. | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| add pB, pB, 32 | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] | |||
| .endm | |||
| // Pipeline drain: accumulates the A pair left in z2/z3 against the B scalars left in z8-z15. | |||
| // Loads nothing and does not advance pA or pB; only issues two B prefetches. | |||
| .macro KERNELv1x4_E | |||
| OP_rr z16.s, p1/m, z2.s, z8.s | |||
| OP_ir z17.s, p1/m, z3.s, z8.s | |||
| OP_ii z16.s, p1/m, z3.s, z9.s | |||
| OP_ri z17.s, p1/m, z2.s, z9.s | |||
| OP_rr z18.s, p1/m, z2.s, z10.s | |||
| OP_ir z19.s, p1/m, z3.s, z10.s | |||
| OP_ii z18.s, p1/m, z3.s, z11.s | |||
| OP_ri z19.s, p1/m, z2.s, z11.s | |||
| OP_rr z20.s, p1/m, z2.s, z12.s | |||
| OP_ir z21.s, p1/m, z3.s, z12.s | |||
| OP_ii z20.s, p1/m, z3.s, z13.s | |||
| OP_ri z21.s, p1/m, z2.s, z13.s | |||
| OP_rr z22.s, p1/m, z2.s, z14.s | |||
| OP_ir z23.s, p1/m, z3.s, z14.s | |||
| OP_ii z22.s, p1/m, z3.s, z15.s | |||
| OP_ri z23.s, p1/m, z2.s, z15.s | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] | |||
| .endm | |||
| // Stand-alone (non-pipelined) K step for the v1x4 tile: loads one A pair into z0/z1 and the | |||
| // eight B scalars into z8-z15, accumulates into z16-z23, and advances pA by one step and pB by 32 bytes. | |||
| .macro KERNELv1x4_SUB | |||
| ld2w {z0.s, z1.s}, p1/z, [pA] | |||
| add pA, pA, lanes, lsl #3 // pA = pA + lanes* 2 * 4 | |||
| ld1rw z8.s, p0/z, [pB] | |||
| ld1rw z9.s, p0/z, [pB, 4] | |||
| ld1rw z10.s, p0/z, [pB, 8] | |||
| ld1rw z11.s, p0/z, [pB, 12] | |||
| OP_rr z16.s, p1/m, z0.s, z8.s | |||
| OP_ir z17.s, p1/m, z1.s, z8.s | |||
| OP_ii z16.s, p1/m, z1.s, z9.s | |||
| OP_ri z17.s, p1/m, z0.s, z9.s | |||
| // The second half of the B scalars is loaded between the first and second accumulate groups. | |||
| ld1rw z12.s, p0/z, [pB, 16] | |||
| ld1rw z13.s, p0/z, [pB, 20] | |||
| ld1rw z14.s, p0/z, [pB, 24] | |||
| ld1rw z15.s, p0/z, [pB, 28] | |||
| OP_rr z18.s, p1/m, z0.s, z10.s | |||
| OP_ir z19.s, p1/m, z1.s, z10.s | |||
| OP_ii z18.s, p1/m, z1.s, z11.s | |||
| OP_ri z19.s, p1/m, z0.s, z11.s | |||
| add pB, pB, 32 | |||
| OP_rr z20.s, p1/m, z0.s, z12.s | |||
| OP_ir z21.s, p1/m, z1.s, z12.s | |||
| OP_ii z20.s, p1/m, z1.s, z13.s | |||
| OP_ri z21.s, p1/m, z0.s, z13.s | |||
| OP_rr z22.s, p1/m, z0.s, z14.s | |||
| OP_ir z23.s, p1/m, z1.s, z14.s | |||
| OP_ii z22.s, p1/m, z1.s, z15.s | |||
| OP_ri z23.s, p1/m, z0.s, z15.s | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| .endm | |||
| // Write-back for the v1x4 tile. For each of pCRow0..pCRow3: load the existing C values as a | |||
| // de-interleaved pair, add alpha times the matching accumulator pair | |||
| // (first += acc0*alphaz_R - acc1*alphaz_I, second += acc0*alphaz_I + acc1*alphaz_R), | |||
| // store the pair back, and advance the row pointer by lanes*8 bytes. | |||
| // NOTE(review): pCRow3 is prefetched both before and after it is advanced, and pCRow2 is never | |||
| // prefetched - confirm whether one of the pCRow3 prefetches was meant for pCRow2. | |||
| .macro SAVEv1x4 | |||
| prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||
| ld2w {z24.s, z25.s}, p1/z, [pCRow0] | |||
| fmla z24.s, p1/m, z16.s, alphaz_R | |||
| fmls z24.s, p1/m, z17.s, alphaz_I | |||
| fmla z25.s, p1/m, z16.s, alphaz_I | |||
| fmla z25.s, p1/m, z17.s, alphaz_R | |||
| st2w {z24.s, z25.s}, p1, [pCRow0] | |||
| add pCRow0, pCRow0, lanes, lsl #3 | |||
| ld2w {z26.s, z27.s}, p1/z, [pCRow1] | |||
| fmla z26.s, p1/m, z18.s, alphaz_R | |||
| fmls z26.s, p1/m, z19.s, alphaz_I | |||
| fmla z27.s, p1/m, z18.s, alphaz_I | |||
| fmla z27.s, p1/m, z19.s, alphaz_R | |||
| st2w {z26.s, z27.s}, p1, [pCRow1] | |||
| add pCRow1, pCRow1, lanes, lsl #3 | |||
| prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||
| ld2w {z28.s, z29.s}, p1/z, [pCRow2] | |||
| fmla z28.s, p1/m, z20.s, alphaz_R | |||
| fmls z28.s, p1/m, z21.s, alphaz_I | |||
| fmla z29.s, p1/m, z20.s, alphaz_I | |||
| fmla z29.s, p1/m, z21.s, alphaz_R | |||
| st2w {z28.s, z29.s}, p1, [pCRow2] | |||
| add pCRow2, pCRow2, lanes, lsl #3 | |||
| ld2w {z30.s, z31.s}, p1/z, [pCRow3] | |||
| fmla z30.s, p1/m, z22.s, alphaz_R | |||
| fmls z30.s, p1/m, z23.s, alphaz_I | |||
| fmla z31.s, p1/m, z22.s, alphaz_I | |||
| fmla z31.s, p1/m, z23.s, alphaz_R | |||
| st2w {z30.s, z31.s}, p1, [pCRow3] | |||
| prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] | |||
| add pCRow3, pCRow3, lanes, lsl #3 // pC = pC + lanes * 2 *4 | |||
| prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] | |||
| .endm | |||
| /******************************************************************************/ | |||
| // Zero the four accumulators z16-z19 used by the v1x2 kernel macros. | |||
| // "mov zN.s, #0" is the architectural alias of "dup zN.s, #0" and assembles to the same instruction. | |||
| .macro INITv1x2 | |||
| mov z16.s, #0 | |||
| mov z17.s, #0 | |||
| mov z18.s, #0 | |||
| mov z19.s, #0 | |||
| .endm | |||
| // Single K step for the v1x2 tile: loads one A pair into z0/z1 and four B scalars into z8-z11, | |||
| // accumulates into z16-z19, and advances pA by one step and pB by 16 bytes. | |||
| .macro KERNELv1x2_SUB | |||
| ld2w {z0.s, z1.s}, p1/z, [pA] | |||
| add pA, pA, lanes, lsl #3 // pA = pA + lanes* 2 * 4 | |||
| ld1rw z8.s, p0/z, [pB] | |||
| ld1rw z9.s, p0/z, [pB, 4] | |||
| ld1rw z10.s, p0/z, [pB, 8] | |||
| ld1rw z11.s, p0/z, [pB, 12] | |||
| OP_rr z16.s, p1/m, z0.s, z8.s | |||
| OP_ir z17.s, p1/m, z1.s, z8.s | |||
| OP_ii z16.s, p1/m, z1.s, z9.s | |||
| OP_ri z17.s, p1/m, z0.s, z9.s | |||
| OP_rr z18.s, p1/m, z0.s, z10.s | |||
| OP_ir z19.s, p1/m, z1.s, z10.s | |||
| OP_ii z18.s, p1/m, z1.s, z11.s | |||
| OP_ri z19.s, p1/m, z0.s, z11.s | |||
| add pB, pB, 16 | |||
| .endm | |||
| // Write-back for the v1x2 tile: for pCRow0 and pCRow1, load the existing C pair, add alpha times | |||
| // the matching accumulator pair (z16/z17, z18/z19), store it back and advance the row pointer | |||
| // by lanes*8 bytes. | |||
| // NOTE(review): the final prefetch targets pCRow2, which this macro neither reads nor advances - | |||
| // confirm that pCRow2 holds a meaningful address on this path. | |||
| .macro SAVEv1x2 | |||
| prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||
| ld2w {z24.s, z25.s}, p1/z, [pCRow0] | |||
| fmla z24.s, p1/m, z16.s, alphaz_R | |||
| fmls z24.s, p1/m, z17.s, alphaz_I | |||
| fmla z25.s, p1/m, z16.s, alphaz_I | |||
| fmla z25.s, p1/m, z17.s, alphaz_R | |||
| st2w {z24.s, z25.s}, p1, [pCRow0] | |||
| add pCRow0, pCRow0, lanes, lsl #3 | |||
| ld2w {z26.s, z27.s}, p1/z, [pCRow1] | |||
| fmla z26.s, p1/m, z18.s, alphaz_R | |||
| fmls z26.s, p1/m, z19.s, alphaz_I | |||
| fmla z27.s, p1/m, z18.s, alphaz_I | |||
| fmla z27.s, p1/m, z19.s, alphaz_R | |||
| st2w {z26.s, z27.s}, p1, [pCRow1] | |||
| add pCRow1, pCRow1, lanes, lsl #3 | |||
| prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||
| prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] | |||
| .endm | |||
| /******************************************************************************/ | |||
| // Zero the two accumulators z16/z17 used by the v1x1 kernel macros. | |||
| // "mov zN.s, #0" is the architectural alias of "dup zN.s, #0" and assembles to the same instruction. | |||
| .macro INITv1x1 | |||
| mov z16.s, #0 | |||
| mov z17.s, #0 | |||
| .endm | |||
| // Single K step for the v1x1 tile: loads one A pair into z0/z1 and two B scalars into z8/z9, | |||
| // advances pA by one step and pB by 8 bytes, then accumulates into z16/z17. | |||
| .macro KERNELv1x1_SUB | |||
| ld2w {z0.s, z1.s}, p1/z, [pA] | |||
| add pA, pA, lanes, lsl #3 // pA = pA + lanes* 2 * 4 | |||
| ld1rw z8.s, p0/z, [pB] | |||
| ld1rw z9.s, p0/z, [pB, 4] | |||
| add pB, pB, 8 | |||
| OP_rr z16.s, p1/m, z0.s, z8.s | |||
| OP_ir z17.s, p1/m, z1.s, z8.s | |||
| OP_ii z16.s, p1/m, z1.s, z9.s | |||
| OP_ri z17.s, p1/m, z0.s, z9.s | |||
| .endm | |||
| // Write-back for the v1x1 tile: load the existing C pair at pCRow0, add alpha times the | |||
| // accumulator pair z16/z17, store it back and advance pCRow0 by lanes*8 bytes. | |||
| // NOTE(review): the final prefetch targets pCRow3, which this macro neither reads nor advances - | |||
| // confirm that pCRow3 holds a meaningful address on this path. | |||
| .macro SAVEv1x1 | |||
| prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||
| ld2w {z24.s, z25.s}, p1/z, [pCRow0] | |||
| fmla z24.s, p1/m, z16.s, alphaz_R | |||
| fmls z24.s, p1/m, z17.s, alphaz_I | |||
| fmla z25.s, p1/m, z16.s, alphaz_I | |||
| fmla z25.s, p1/m, z17.s, alphaz_R | |||
| st2w {z24.s, z25.s}, p1, [pCRow0] | |||
| add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 2 *4 | |||
| prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] | |||
| .endm | |||
| /******************************************************************************/ | |||
| /******************************************************************************* | |||
| * End of macro definitions | |||
| *******************************************************************************/ | |||
| PROLOGUE /* entry of the kernel; the .Lcgemm_kernel_* labels below are its local branch targets */ | |||
| .align 5 | |||
| add sp, sp, #-(11 * 16) /* reserve an 11 * 16 byte save area on the stack */ | |||
| stp d8, d9, [sp, #(0 * 16)] /* save d8-d17 in slots 0-4 */ | |||
| stp d10, d11, [sp, #(1 * 16)] | |||
| stp d12, d13, [sp, #(2 * 16)] | |||
| stp d14, d15, [sp, #(3 * 16)] | |||
| stp d16, d17, [sp, #(4 * 16)] | |||
| stp x18, x19, [sp, #(5 * 16)] /* save x18-x28 in slots 5-10 */ | |||
| stp x20, x21, [sp, #(6 * 16)] | |||
| stp x22, x23, [sp, #(7 * 16)] | |||
| stp x24, x25, [sp, #(8 * 16)] | |||
| stp x26, x27, [sp, #(9 * 16)] | |||
| str x28, [sp, #(10 * 16)] | |||
| prfm PLDL1KEEP, [origPB] /* prefetch the start of both input buffers */ | |||
| prfm PLDL1KEEP, [origPA] | |||
| fmov alphaR, s0 /* s0 and s1 carry the two alpha components (real and imaginary, going by the alias names) */ | |||
| dup alphaz_R, alphaR /* replicate the scalar into every vector element */ | |||
| fmov alphaI, s1 | |||
| dup alphaz_I, alphaI | |||
| lsl LDC, LDC, #3 // ldc = ldc * 2 * 4 | |||
| ptrue p0.s // create true predicate | |||
| mov pB, origPB | |||
| // Loop over N | |||
| mov counterJ, origN | |||
| asr counterJ, counterJ, #2 // J = J / 4 | |||
| cmp counterJ, #0 | |||
| ble .Lcgemm_kernel_L2_BEGIN /* N < 4: skip the 4-column loop */ | |||
| /******************************************************************************/ | |||
| .Lcgemm_kernel_L4_BEGIN: /* one iteration per group of 4 columns of C */ | |||
| mov pCRow0, pC /* pCRow0..pCRow3 = four consecutive columns, LDC bytes apart */ | |||
| add pCRow1, pCRow0, LDC | |||
| add pCRow2, pCRow1, LDC | |||
| add pCRow3, pCRow2, LDC | |||
| add pC, pCRow3, LDC /* pC now points past this group of columns */ | |||
| mov pA, origPA // pA = start of A array | |||
| .Lcgemm_kernel_L4_Mv1_BEGIN: | |||
| /* Loop over M is done in an SVE fashion. This has the benefit of the last M%SVE_LEN iterations being done in a single sweep */ | |||
| mov counterI, #0 /* counterI = first row index covered by the current vector */ | |||
| whilelt p1.s, counterI, origM /* p1 = lanes with counterI + lane < origM */ | |||
| cntp lanes, p0, p1.s // lanes contain number of active SVE lanes in M dimension | |||
| .align 5 | |||
| .Lcgemm_kernel_L4_Mv1_20: /* start of one row block (one vector of rows) */ | |||
| mov pB, origPB /* every row block starts again at the head of the current B panel */ | |||
| INITv1x4 // fill with zeros | |||
| asr counterL , origK, #3 /* counterL = K / 8 */ | |||
| cmp counterL , #2 | |||
| blt .Lcgemm_kernel_L4_Mv1_32 /* fewer than two groups of 8: take the short path */ | |||
| KERNELv1x4_I /* first group of 8 kernel macros, opened by the _I variant */ | |||
| KERNELv1x4_M2 | |||
| KERNELv1x4_M1 | |||
| KERNELv1x4_M2 | |||
| KERNELv1x4_M1 | |||
| KERNELv1x4_M2 | |||
| KERNELv1x4_M1 | |||
| KERNELv1x4_M2 | |||
| subs counterL, counterL, #2 // subtract 2 | |||
| ble .Lcgemm_kernel_L4_Mv1_22a /* K / 8 == 2: only the closing group is left */ | |||
| .align 5 | |||
| .Lcgemm_kernel_L4_Mv1_22: /* middle groups: 8 kernel macros per iteration */ | |||
| KERNELv1x4_M1 | |||
| KERNELv1x4_M2 | |||
| KERNELv1x4_M1 | |||
| KERNELv1x4_M2 | |||
| KERNELv1x4_M1 | |||
| KERNELv1x4_M2 | |||
| KERNELv1x4_M1 | |||
| KERNELv1x4_M2 | |||
| subs counterL, counterL, #1 | |||
| bgt .Lcgemm_kernel_L4_Mv1_22 | |||
| .align 5 | |||
| .Lcgemm_kernel_L4_Mv1_22a: /* closing group of 8, finished by the _E variant */ | |||
| KERNELv1x4_M1 | |||
| KERNELv1x4_M2 | |||
| KERNELv1x4_M1 | |||
| KERNELv1x4_M2 | |||
| KERNELv1x4_M1 | |||
| KERNELv1x4_M2 | |||
| KERNELv1x4_M1 | |||
| KERNELv1x4_E | |||
| b .Lcgemm_kernel_L4_Mv1_44 /* continue with the K % 8 leftover */ | |||
| .align 5 | |||
| .Lcgemm_kernel_L4_Mv1_32: /* reached when K / 8 is below 2 */ | |||
| tst counterL, #1 | |||
| ble .Lcgemm_kernel_L4_Mv1_40 /* tst clears V and the masked result is never negative, so this is taken exactly when bit 0 of counterL is clear (K / 8 == 0) */ | |||
| KERNELv1x4_I /* the only group of 8: _I ... _E */ | |||
| KERNELv1x4_M2 | |||
| KERNELv1x4_M1 | |||
| KERNELv1x4_M2 | |||
| KERNELv1x4_M1 | |||
| KERNELv1x4_M2 | |||
| KERNELv1x4_M1 | |||
| KERNELv1x4_E | |||
| b .Lcgemm_kernel_L4_Mv1_44 | |||
| .Lcgemm_kernel_L4_Mv1_40: | |||
| INITv1x4 /* same init macro as at the top of the row block */ | |||
| .Lcgemm_kernel_L4_Mv1_44: | |||
| ands counterL , origK, #7 /* counterL = K % 8 */ | |||
| ble .Lcgemm_kernel_L4_Mv1_100 /* no leftover k steps */ | |||
| .align 5 | |||
| .Lcgemm_kernel_L4_Mv1_46: | |||
| KERNELv1x4_SUB /* invoked once per leftover k step */ | |||
| subs counterL, counterL, #1 | |||
| bne .Lcgemm_kernel_L4_Mv1_46 | |||
| .Lcgemm_kernel_L4_Mv1_100: | |||
| prfm PLDL1KEEP, [pA] | |||
| prfm PLDL1KEEP, [pA, #64] | |||
| prfm PLDL1KEEP, [origPB] | |||
| SAVEv1x4 /* macro defined earlier in the file; presumably writes the 4-column result tile to C */ | |||
| .Lcgemm_kernel_L4_Mv1_END: | |||
| incw counterI /* advance by the number of 32-bit elements per vector */ | |||
| whilelt p1.s, counterI, origM //SVE instruction | |||
| cntp lanes, p0, p1.s // lanes contain number of active SVE lanes in M dimension | |||
| b.any .Lcgemm_kernel_L4_Mv1_20 /* loop while at least one lane of p1 is active */ | |||
| .Lcgemm_kernel_L4_END: | |||
| lsl temp, origK, #5 /* temp = K * 32 bytes */ | |||
| add origPB, origPB, temp // B = B + K * 4 * 4 * 2 | |||
| subs counterJ, counterJ , #1 // j-- | |||
| bgt .Lcgemm_kernel_L4_BEGIN | |||
| /******************************************************************************/ | |||
| .Lcgemm_kernel_L2_BEGIN: // fewer than 4 columns left in N direction | |||
| mov counterJ , origN | |||
| tst counterJ , #3 | |||
| ble .Lcgemm_kernel_L999 /* N % 4 == 0: nothing left to do */ | |||
| tst counterJ , #2 | |||
| ble .Lcgemm_kernel_L1_BEGIN /* bit 1 of N clear: no 2-column panel */ | |||
| mov pCRow0, pC // pCRow0 = pC | |||
| add pCRow1, pCRow0, LDC | |||
| add pC,pC,LDC, lsl #1 /* pC advances by two columns */ | |||
| mov pA, origPA // pA = A | |||
| .Lcgemm_kernel_L2_Mv1_BEGIN: | |||
| mov counterI, #0 | |||
| whilelt p1.s, counterI, origM //SVE instruction | |||
| cntp lanes, p0, p1.s | |||
| .Lcgemm_kernel_L2_Mv1_20: /* start of one row block of the 2-column panel */ | |||
| INITv1x2 | |||
| mov pB, origPB | |||
| asr counterL , origK, #3 // counterL = counterL / 8 | |||
| cmp counterL,#0 | |||
| ble .Lcgemm_kernel_L2_Mv1_40 | |||
| .align 5 | |||
| .Lcgemm_kernel_L2_Mv1_22: /* 8 kernel macros per iteration */ | |||
| KERNELv1x2_SUB | |||
| KERNELv1x2_SUB | |||
| KERNELv1x2_SUB | |||
| KERNELv1x2_SUB | |||
| KERNELv1x2_SUB | |||
| KERNELv1x2_SUB | |||
| KERNELv1x2_SUB | |||
| KERNELv1x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt .Lcgemm_kernel_L2_Mv1_22 | |||
| .Lcgemm_kernel_L2_Mv1_40: | |||
| ands counterL , origK, #7 // counterL = counterL % 8 | |||
| ble .Lcgemm_kernel_L2_Mv1_100 | |||
| .Lcgemm_kernel_L2_Mv1_42: /* one kernel macro per leftover k step */ | |||
| KERNELv1x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt .Lcgemm_kernel_L2_Mv1_42 | |||
| .Lcgemm_kernel_L2_Mv1_100: | |||
| SAVEv1x2 | |||
| .Lcgemm_kernel_L2_Mv1_END: | |||
| incw counterI | |||
| whilelt p1.s, counterI, origM //SVE instruction | |||
| cntp lanes, p0, p1.s | |||
| b.any .Lcgemm_kernel_L2_Mv1_20 /* loop while at least one lane of p1 is active */ | |||
| .Lcgemm_kernel_L2_END: | |||
| lsl temp, origK, #4 /* temp = K * 16 bytes */ | |||
| add origPB, origPB, temp // B = B + K * 2 * 4 * 2 | |||
| /******************************************************************************/ | |||
| .Lcgemm_kernel_L1_BEGIN: | |||
| mov counterJ , origN | |||
| tst counterJ , #1 | |||
| ble .Lcgemm_kernel_L999 // done | |||
| mov pCRow0, pC // pCRow0 = C | |||
| add pC , pC , LDC // Update pC to point to next | |||
| mov pA, origPA // pA = A | |||
| .Lcgemm_kernel_L1_Mv1_BEGIN: | |||
| mov counterI, #0 | |||
| whilelt p1.s, counterI, origM //SVE instruction | |||
| cntp lanes, p0, p1.s | |||
| .Lcgemm_kernel_L1_Mv1_20: /* start of one row block of the single-column panel */ | |||
| INITv1x1 | |||
| mov pB, origPB | |||
| asr counterL , origK, #3 // counterL = counterL / 8 | |||
| cmp counterL , #0 | |||
| ble .Lcgemm_kernel_L1_Mv1_40 | |||
| .align 5 | |||
| .Lcgemm_kernel_L1_Mv1_22: /* 8 kernel macros per iteration */ | |||
| KERNELv1x1_SUB | |||
| KERNELv1x1_SUB | |||
| KERNELv1x1_SUB | |||
| KERNELv1x1_SUB | |||
| KERNELv1x1_SUB | |||
| KERNELv1x1_SUB | |||
| KERNELv1x1_SUB | |||
| KERNELv1x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt .Lcgemm_kernel_L1_Mv1_22 | |||
| .Lcgemm_kernel_L1_Mv1_40: | |||
| ands counterL , origK, #7 // counterL = counterL % 8 | |||
| ble .Lcgemm_kernel_L1_Mv1_100 | |||
| .Lcgemm_kernel_L1_Mv1_42: /* one kernel macro per leftover k step */ | |||
| KERNELv1x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt .Lcgemm_kernel_L1_Mv1_42 | |||
| .Lcgemm_kernel_L1_Mv1_100: | |||
| SAVEv1x1 | |||
| .Lcgemm_kernel_L1_Mv1_END: | |||
| incw counterI | |||
| whilelt p1.s, counterI, origM //SVE instruction | |||
| cntp lanes, p0, p1.s | |||
| b.any .Lcgemm_kernel_L1_Mv1_20 /* loop while at least one lane of p1 is active */ | |||
| .Lcgemm_kernel_L1_END: | |||
| /******************************************************************************/ | |||
| .Lcgemm_kernel_L999: | |||
| mov x0, #0 // set return value | |||
| ldp d8, d9, [sp, #(0 * 16)] /* restore every register saved at entry */ | |||
| ldp d10, d11, [sp, #(1 * 16)] | |||
| ldp d12, d13, [sp, #(2 * 16)] | |||
| ldp d14, d15, [sp, #(3 * 16)] | |||
| ldp d16, d17, [sp, #(4 * 16)] | |||
| ldp x18, x19, [sp, #(5 * 16)] | |||
| ldp x20, x21, [sp, #(6 * 16)] | |||
| ldp x22, x23, [sp, #(7 * 16)] | |||
| ldp x24, x25, [sp, #(8 * 16)] | |||
| ldp x26, x27, [sp, #(9 * 16)] | |||
| ldr x28, [sp, #(10 * 16)] | |||
| add sp, sp, #(11*16) /* release the save area */ | |||
| ret | |||
| EPILOGUE | |||
| @@ -0,0 +1,79 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| #include <arm_sve.h> | |||
| // TODO: write in assembly with proper unrolling of inner loop | |||
| int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ | |||
| BLASLONG j; | |||
| IFLOAT *aoffset, *aoffset1, *boffset; | |||
| svint32_t lda_vec = svindex_s32(0, lda * 2); | |||
| aoffset = a; | |||
| boffset = b; | |||
| j = 0; | |||
| svbool_t pg = svwhilelt_b32(j, n); | |||
| uint32_t active = svcntp_b32(svptrue_b32(), pg); | |||
| do { | |||
| aoffset1 = aoffset; | |||
| uint32_t i_cnt = m; | |||
| while (i_cnt--) { | |||
| svfloat32_t a_vec_real = svld1_gather_index(pg, (float *) aoffset1, lda_vec); | |||
| svfloat32_t a_vec_imag = svld1_gather_index(pg, ((float *) aoffset1) + 1, lda_vec); | |||
| svst2_f32(pg, (float *) boffset, svcreate2(a_vec_real, a_vec_imag)); | |||
| aoffset1 += 2; | |||
| boffset += active * 2; | |||
| } | |||
| aoffset += active * lda * 2; | |||
| j += svcntw(); | |||
| pg = svwhilelt_b32(j, n); | |||
| active = svcntp_b32(svptrue_b32(), pg); | |||
| } while (svptest_any(svptrue_b32(), pg)); | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,75 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| #include <arm_sve.h> | |||
| // TODO: write in assembly with proper unrolling of inner loop | |||
| int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ | |||
| BLASLONG j; | |||
| IFLOAT *aoffset, *aoffset1, *boffset; | |||
| aoffset = a; | |||
| boffset = b; | |||
| j = 0; | |||
| svbool_t pg = svwhilelt_b32(j, n); | |||
| uint32_t active = svcntp_b32(svptrue_b32(), pg); | |||
| do { | |||
| aoffset1 = aoffset; | |||
| uint32_t i_cnt = m; | |||
| while (i_cnt--) { | |||
| svfloat32x2_t a_vec = svld2(pg, (float *)aoffset1); | |||
| svst2_f32(pg, (float *) boffset, a_vec); | |||
| aoffset1 += lda * 2; | |||
| boffset += active * 2; | |||
| } | |||
| aoffset += active * 2; | |||
| j += svcntw(); | |||
| pg = svwhilelt_b32(j, n); | |||
| active = svcntp_b32(svptrue_b32(), pg); | |||
| } while (svptest_any(svptrue_b32(), pg)); | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,320 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #include "common.h" | |||
| #include "arm_sve.h" | |||
| static FLOAT dm1 = -1.; | |||
| #ifdef CONJ | |||
| #define GEMM_KERNEL GEMM_KERNEL_L | |||
| #else | |||
| #define GEMM_KERNEL GEMM_KERNEL_N | |||
| #endif | |||
| #if GEMM_DEFAULT_UNROLL_N == 1 | |||
| #define GEMM_UNROLL_N_SHIFT 0 | |||
| #endif | |||
| #if GEMM_DEFAULT_UNROLL_N == 2 | |||
| #define GEMM_UNROLL_N_SHIFT 1 | |||
| #endif | |||
| #if GEMM_DEFAULT_UNROLL_N == 4 | |||
| #define GEMM_UNROLL_N_SHIFT 2 | |||
| #endif | |||
| #if GEMM_DEFAULT_UNROLL_N == 8 | |||
| #define GEMM_UNROLL_N_SHIFT 3 | |||
| #endif | |||
| #if GEMM_DEFAULT_UNROLL_N == 16 | |||
| #define GEMM_UNROLL_N_SHIFT 4 | |||
| #endif | |||
| #ifndef COMPLEX | |||
| static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { | |||
| FLOAT aa, bb; | |||
| int i, j, k; | |||
| a += (m - 1) * m; | |||
| b += (m - 1) * n; | |||
| for (i = m - 1; i >= 0; i--) { | |||
| aa = *(a + i); | |||
| for (j = 0; j < n; j ++) { | |||
| bb = *(c + i + j * ldc); | |||
| bb *= aa; | |||
| *b = bb; | |||
| *(c + i + j * ldc) = bb; | |||
| b ++; | |||
| for (k = 0; k < i; k ++){ | |||
| *(c + k + j * ldc) -= bb * *(a + k); | |||
| } | |||
| } | |||
| a -= m; | |||
| b -= 2 * n; | |||
| } | |||
| } | |||
| #else | |||
| static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { | |||
| FLOAT aa1, aa2; | |||
| FLOAT bb1, bb2; | |||
| FLOAT cc1, cc2; | |||
| int i, j, k; | |||
| ldc *= 2; | |||
| a += (m - 1) * m * 2; | |||
| b += (m - 1) * n * 2; | |||
| for (i = m - 1; i >= 0; i--) { | |||
| aa1 = *(a + i * 2 + 0); | |||
| aa2 = *(a + i * 2 + 1); | |||
| for (j = 0; j < n; j ++) { | |||
| bb1 = *(c + i * 2 + 0 + j * ldc); | |||
| bb2 = *(c + i * 2 + 1 + j * ldc); | |||
| #ifndef CONJ | |||
| cc1 = aa1 * bb1 - aa2 * bb2; | |||
| cc2 = aa1 * bb2 + aa2 * bb1; | |||
| #else | |||
| cc1 = aa1 * bb1 + aa2 * bb2; | |||
| cc2 = aa1 * bb2 - aa2 * bb1; | |||
| #endif | |||
| *(b + 0) = cc1; | |||
| *(b + 1) = cc2; | |||
| *(c + i * 2 + 0 + j * ldc) = cc1; | |||
| *(c + i * 2 + 1 + j * ldc) = cc2; | |||
| b += 2; | |||
| for (k = 0; k < i; k ++){ | |||
| #ifndef CONJ | |||
| *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) - cc2 * *(a + k * 2 + 1); | |||
| *(c + k * 2 + 1 + j * ldc) -= cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); | |||
| #else | |||
| *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) + cc2 * *(a + k * 2 + 1); | |||
| *(c + k * 2 + 1 + j * ldc) -= - cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); | |||
| #endif | |||
| } | |||
| } | |||
| a -= m * 2; | |||
| b -= 4 * n; | |||
| } | |||
| } | |||
| #endif | |||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, | |||
| #ifdef COMPLEX | |||
| FLOAT dummy2, | |||
| #endif | |||
| FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ | |||
| BLASLONG i, j; | |||
| FLOAT *aa, *cc; | |||
| BLASLONG kk; | |||
| #ifdef DOUBLE | |||
| int sve_size = svcntd(); | |||
| #else | |||
| int sve_size = svcntw(); | |||
| #endif | |||
| #if 0 | |||
| fprintf(stderr, "TRSM KERNEL LN : m = %3ld n = %3ld k = %3ld offset = %3ld\n", | |||
| m, n, k, offset); | |||
| #endif | |||
| j = (n >> GEMM_UNROLL_N_SHIFT); | |||
| while (j > 0) { | |||
| kk = m + offset; | |||
| i = m % sve_size; | |||
| if (i) { | |||
| aa = a + (m - i) * k * COMPSIZE; | |||
| cc = c + (m - i) * COMPSIZE; | |||
| if (k - kk > 0) { | |||
| GEMM_KERNEL(i, GEMM_UNROLL_N, k - kk, dm1, | |||
| #ifdef COMPLEX | |||
| ZERO, | |||
| #endif | |||
| aa + i * kk * COMPSIZE, | |||
| b + GEMM_UNROLL_N * kk * COMPSIZE, | |||
| cc, | |||
| ldc); | |||
| } | |||
| solve(i, GEMM_UNROLL_N, | |||
| aa + (kk - i) * i * COMPSIZE, | |||
| b + (kk - i) * GEMM_UNROLL_N * COMPSIZE, | |||
| cc, ldc); | |||
| kk -= i; | |||
| } | |||
| int mod = i; | |||
| i = sve_size; | |||
| if (i <= m) { | |||
| aa = a + (m - mod - sve_size) * k * COMPSIZE; | |||
| cc = c + (m - mod - sve_size) * COMPSIZE; | |||
| do { | |||
| if (k - kk > 0) { | |||
| GEMM_KERNEL(sve_size, GEMM_UNROLL_N, k - kk, dm1, | |||
| #ifdef COMPLEX | |||
| ZERO, | |||
| #endif | |||
| aa + sve_size * kk * COMPSIZE, | |||
| b + GEMM_UNROLL_N * kk * COMPSIZE, | |||
| cc, | |||
| ldc); | |||
| } | |||
| solve(sve_size, GEMM_UNROLL_N, | |||
| aa + (kk - sve_size) * sve_size * COMPSIZE, | |||
| b + (kk - sve_size) * GEMM_UNROLL_N * COMPSIZE, | |||
| cc, ldc); | |||
| aa -= sve_size * k * COMPSIZE; | |||
| cc -= sve_size * COMPSIZE; | |||
| kk -= sve_size; | |||
| i += sve_size; | |||
| } while (i <= m); | |||
| } | |||
| b += GEMM_UNROLL_N * k * COMPSIZE; | |||
| c += GEMM_UNROLL_N * ldc * COMPSIZE; | |||
| j --; | |||
| } | |||
| if (n & (GEMM_UNROLL_N - 1)) { | |||
| j = (GEMM_UNROLL_N >> 1); | |||
| while (j > 0) { | |||
| if (n & j) { | |||
| kk = m + offset; | |||
| i = m % sve_size; | |||
| if (i) { | |||
| aa = a + (m - i) * k * COMPSIZE; | |||
| cc = c + (m - i) * COMPSIZE; | |||
| if (k - kk > 0) { | |||
| GEMM_KERNEL(i, j, k - kk, dm1, | |||
| #ifdef COMPLEX | |||
| ZERO, | |||
| #endif | |||
| aa + i * kk * COMPSIZE, | |||
| b + j * kk * COMPSIZE, | |||
| cc, ldc); | |||
| } | |||
| solve(i, j, | |||
| aa + (kk - i) * i * COMPSIZE, | |||
| b + (kk - i) * j * COMPSIZE, | |||
| cc, ldc); | |||
| kk -= i; | |||
| } | |||
| int mod = i; | |||
| i = sve_size; | |||
| if (i <= m) { | |||
| aa = a + (m - mod - sve_size) * k * COMPSIZE; | |||
| cc = c + (m - mod - sve_size) * COMPSIZE; | |||
| do { | |||
| if (k - kk > 0) { | |||
| GEMM_KERNEL(sve_size, j, k - kk, dm1, | |||
| #ifdef COMPLEX | |||
| ZERO, | |||
| #endif | |||
| aa + sve_size * kk * COMPSIZE, | |||
| b + j * kk * COMPSIZE, | |||
| cc, | |||
| ldc); | |||
| } | |||
| solve(sve_size, j, | |||
| aa + (kk - sve_size) * sve_size * COMPSIZE, | |||
| b + (kk - sve_size) * j * COMPSIZE, | |||
| cc, ldc); | |||
| aa -= sve_size * k * COMPSIZE; | |||
| cc -= sve_size * COMPSIZE; | |||
| kk -= sve_size; | |||
| i += sve_size; | |||
| } while (i <= m); | |||
| } | |||
| b += j * k * COMPSIZE; | |||
| c += j * ldc * COMPSIZE; | |||
| } | |||
| j >>= 1; | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,295 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #include "common.h" | |||
| #include "arm_sve.h" | |||
| static FLOAT dm1 = -1.; | |||
| #ifdef CONJ | |||
| #define GEMM_KERNEL GEMM_KERNEL_L | |||
| #else | |||
| #define GEMM_KERNEL GEMM_KERNEL_N | |||
| #endif | |||
| #if GEMM_DEFAULT_UNROLL_N == 1 | |||
| #define GEMM_UNROLL_N_SHIFT 0 | |||
| #endif | |||
| #if GEMM_DEFAULT_UNROLL_N == 2 | |||
| #define GEMM_UNROLL_N_SHIFT 1 | |||
| #endif | |||
| #if GEMM_DEFAULT_UNROLL_N == 4 | |||
| #define GEMM_UNROLL_N_SHIFT 2 | |||
| #endif | |||
| #if GEMM_DEFAULT_UNROLL_N == 8 | |||
| #define GEMM_UNROLL_N_SHIFT 3 | |||
| #endif | |||
| #if GEMM_DEFAULT_UNROLL_N == 16 | |||
| #define GEMM_UNROLL_N_SHIFT 4 | |||
| #endif | |||
| #ifndef COMPLEX | |||
| static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { | |||
| FLOAT aa, bb; | |||
| int i, j, k; | |||
| for (i = 0; i < m; i++) { | |||
| aa = *(a + i); | |||
| for (j = 0; j < n; j ++) { | |||
| bb = *(c + i + j * ldc); | |||
| bb *= aa; | |||
| *b = bb; | |||
| *(c + i + j * ldc) = bb; | |||
| b ++; | |||
| for (k = i + 1; k < m; k ++){ | |||
| *(c + k + j * ldc) -= bb * *(a + k); | |||
| } | |||
| } | |||
| a += m; | |||
| } | |||
| } | |||
| #else | |||
| static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { | |||
| FLOAT aa1, aa2; | |||
| FLOAT bb1, bb2; | |||
| FLOAT cc1, cc2; | |||
| int i, j, k; | |||
| ldc *= 2; | |||
| for (i = 0; i < m; i++) { | |||
| aa1 = *(a + i * 2 + 0); | |||
| aa2 = *(a + i * 2 + 1); | |||
| for (j = 0; j < n; j ++) { | |||
| bb1 = *(c + i * 2 + 0 + j * ldc); | |||
| bb2 = *(c + i * 2 + 1 + j * ldc); | |||
| #ifndef CONJ | |||
| cc1 = aa1 * bb1 - aa2 * bb2; | |||
| cc2 = aa1 * bb2 + aa2 * bb1; | |||
| #else | |||
| cc1 = aa1 * bb1 + aa2 * bb2; | |||
| cc2 = aa1 * bb2 - aa2 * bb1; | |||
| #endif | |||
| *(b + 0) = cc1; | |||
| *(b + 1) = cc2; | |||
| *(c + i * 2 + 0 + j * ldc) = cc1; | |||
| *(c + i * 2 + 1 + j * ldc) = cc2; | |||
| b += 2; | |||
| for (k = i + 1; k < m; k ++){ | |||
| #ifndef CONJ | |||
| *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) - cc2 * *(a + k * 2 + 1); | |||
| *(c + k * 2 + 1 + j * ldc) -= cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); | |||
| #else | |||
| *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) + cc2 * *(a + k * 2 + 1); | |||
| *(c + k * 2 + 1 + j * ldc) -= -cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); | |||
| #endif | |||
| } | |||
| } | |||
| a += m * 2; | |||
| } | |||
| } | |||
| #endif | |||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, | |||
| #ifdef COMPLEX | |||
| FLOAT dummy2, | |||
| #endif | |||
| FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ | |||
| FLOAT *aa, *cc; | |||
| BLASLONG kk; | |||
| BLASLONG i, j, jj; | |||
| #ifdef DOUBLE | |||
| int sve_size = svcntd(); | |||
| #else | |||
| int sve_size = svcntw(); | |||
| #endif | |||
| #if 0 | |||
| fprintf(stderr, "TRSM KERNEL LT : m = %3ld n = %3ld k = %3ld offset = %3ld\n", | |||
| m, n, k, offset); | |||
| #endif | |||
| jj = 0; | |||
| j = (n >> GEMM_UNROLL_N_SHIFT); | |||
| while (j > 0) { | |||
| kk = offset; | |||
| aa = a; | |||
| cc = c; | |||
| i = sve_size; | |||
| while (i <= m) { | |||
| if (kk > 0) { | |||
| GEMM_KERNEL(sve_size, GEMM_UNROLL_N, kk, dm1, | |||
| #ifdef COMPLEX | |||
| ZERO, | |||
| #endif | |||
| aa, b, cc, ldc); | |||
| } | |||
| solve(sve_size, GEMM_UNROLL_N, | |||
| aa + kk * sve_size * COMPSIZE, | |||
| b + kk * GEMM_UNROLL_N * COMPSIZE, | |||
| cc, ldc); | |||
| aa += sve_size * k * COMPSIZE; | |||
| cc += sve_size * COMPSIZE; | |||
| kk += sve_size; | |||
| i += sve_size; | |||
| } | |||
| i = m % sve_size; | |||
| if (i) { | |||
| if (kk > 0) { | |||
| GEMM_KERNEL(i, GEMM_UNROLL_N, kk, dm1, | |||
| #ifdef COMPLEX | |||
| ZERO, | |||
| #endif | |||
| aa, b, cc, ldc); | |||
| } | |||
| solve(i, GEMM_UNROLL_N, | |||
| aa + kk * i * COMPSIZE, | |||
| b + kk * GEMM_UNROLL_N * COMPSIZE, | |||
| cc, ldc); | |||
| aa += i * k * COMPSIZE; | |||
| cc += i * COMPSIZE; | |||
| kk += i; | |||
| } | |||
| b += GEMM_UNROLL_N * k * COMPSIZE; | |||
| c += GEMM_UNROLL_N * ldc * COMPSIZE; | |||
| j --; | |||
| jj += sve_size; | |||
| } | |||
| if (n & (GEMM_UNROLL_N - 1)) { | |||
| j = (GEMM_UNROLL_N >> 1); | |||
| while (j > 0) { | |||
| if (n & j) { | |||
| kk = offset; | |||
| aa = a; | |||
| cc = c; | |||
| i = sve_size; | |||
| while (i <= m) { | |||
| if (kk > 0) { | |||
| GEMM_KERNEL(sve_size, j, kk, dm1, | |||
| #ifdef COMPLEX | |||
| ZERO, | |||
| #endif | |||
| aa, | |||
| b, | |||
| cc, | |||
| ldc); | |||
| } | |||
| solve(sve_size, j, | |||
| aa + kk * sve_size * COMPSIZE, | |||
| b + kk * j * COMPSIZE, cc, ldc); | |||
| aa += sve_size * k * COMPSIZE; | |||
| cc += sve_size * COMPSIZE; | |||
| kk += sve_size; | |||
| i += sve_size; | |||
| } | |||
| i = m % sve_size; | |||
| if (i) { | |||
| if (kk > 0) { | |||
| GEMM_KERNEL(i, j, kk, dm1, | |||
| #ifdef COMPLEX | |||
| ZERO, | |||
| #endif | |||
| aa, | |||
| b, | |||
| cc, | |||
| ldc); | |||
| } | |||
| solve(i, j, | |||
| aa + kk * i * COMPSIZE, | |||
| b + kk * j * COMPSIZE, cc, ldc); | |||
| aa += i * k * COMPSIZE; | |||
| cc += i * COMPSIZE; | |||
| kk += i; | |||
| } | |||
| b += j * k * COMPSIZE; | |||
| c += j * ldc * COMPSIZE; | |||
| } | |||
| j >>= 1; | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,293 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #include "common.h" | |||
| #include "arm_sve.h" | |||
| static FLOAT dm1 = -1.; | |||
| #ifdef CONJ | |||
| #define GEMM_KERNEL GEMM_KERNEL_R | |||
| #else | |||
| #define GEMM_KERNEL GEMM_KERNEL_N | |||
| #endif | |||
| #if GEMM_DEFAULT_UNROLL_N == 1 | |||
| #define GEMM_UNROLL_N_SHIFT 0 | |||
| #endif | |||
| #if GEMM_DEFAULT_UNROLL_N == 2 | |||
| #define GEMM_UNROLL_N_SHIFT 1 | |||
| #endif | |||
| #if GEMM_DEFAULT_UNROLL_N == 4 | |||
| #define GEMM_UNROLL_N_SHIFT 2 | |||
| #endif | |||
| #if GEMM_DEFAULT_UNROLL_N == 8 | |||
| #define GEMM_UNROLL_N_SHIFT 3 | |||
| #endif | |||
| #if GEMM_DEFAULT_UNROLL_N == 16 | |||
| #define GEMM_UNROLL_N_SHIFT 4 | |||
| #endif | |||
| #ifndef COMPLEX | |||
| static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { | |||
| FLOAT aa, bb; | |||
| int i, j, k; | |||
| for (i = 0; i < n; i++) { | |||
| bb = *(b + i); | |||
| for (j = 0; j < m; j ++) { | |||
| aa = *(c + j + i * ldc); | |||
| aa *= bb; | |||
| *a = aa; | |||
| *(c + j + i * ldc) = aa; | |||
| a ++; | |||
| for (k = i + 1; k < n; k ++){ | |||
| *(c + j + k * ldc) -= aa * *(b + k); | |||
| } | |||
| } | |||
| b += n; | |||
| } | |||
| } | |||
| #else | |||
| /* Complex-valued triangular solve of one m x n tile of C, sweeping the n | |||
|  * columns in increasing order.  All arrays hold interleaved (re, im) pairs. | |||
|  * | |||
|  *   m, n : height and width of the tile, counted in complex elements. | |||
|  *   a    : output buffer; solved column i occupies a[2*i*m .. 2*i*m+2*m-1]. | |||
|  *   b    : n x n coefficient block, row i starting at b + 2*i*n.  The pair | |||
|  *          at position i of that row multiplies the column (presumably the | |||
|  *          pre-inverted diagonal - TODO confirm). | |||
|  *   c    : tile of C, column stride ldc complex elements; solved in place. | |||
|  * | |||
|  * When CONJ is defined the signs of the imaginary cross terms are flipped, | |||
|  * i.e. the coefficient is used conjugated. | |||
|  */ | |||
| static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { | |||
|   int col, row, next; | |||
|   ldc *= 2;   /* from here on ldc is a stride in FLOATs */ | |||
|   for (col = 0; col < n; col++) { | |||
|     FLOAT *b_row = b + col * n * 2; | |||
|     FLOAT *a_col = a + col * m * 2; | |||
|     FLOAT *c_col = c + col * ldc; | |||
|     FLOAT d_re = b_row[col * 2 + 0]; | |||
|     FLOAT d_im = b_row[col * 2 + 1]; | |||
|     for (row = 0; row < m; row++) { | |||
|       FLOAT c_re = c_col[row * 2 + 0]; | |||
|       FLOAT c_im = c_col[row * 2 + 1]; | |||
|       FLOAT x_re, x_im; | |||
| #ifndef CONJ | |||
|       x_re = c_re * d_re - c_im * d_im; | |||
|       x_im = c_re * d_im + c_im * d_re; | |||
| #else | |||
|       x_re = c_re * d_re + c_im * d_im; | |||
|       x_im = -c_re * d_im + c_im * d_re; | |||
| #endif | |||
|       a_col[row * 2 + 0] = x_re; | |||
|       a_col[row * 2 + 1] = x_im; | |||
|       c_col[row * 2 + 0] = x_re; | |||
|       c_col[row * 2 + 1] = x_im; | |||
|       /* remove this element's contribution from every later column */ | |||
|       for (next = col + 1; next < n; next++) { | |||
|         FLOAT *c_next = c + row * 2 + next * ldc; | |||
| #ifndef CONJ | |||
|         c_next[0] -= x_re * b_row[next * 2 + 0] - x_im * b_row[next * 2 + 1]; | |||
|         c_next[1] -= x_re * b_row[next * 2 + 1] + x_im * b_row[next * 2 + 0]; | |||
| #else | |||
|         c_next[0] -= x_re * b_row[next * 2 + 0] + x_im * b_row[next * 2 + 1]; | |||
|         c_next[1] -= - x_re * b_row[next * 2 + 1] + x_im * b_row[next * 2 + 0]; | |||
| #endif | |||
|       } | |||
|     } | |||
|   } | |||
| } | |||
| #endif | |||
| /* TRSM micro-kernel driver (the disabled trace below labels it "TRSM RN"). | |||
|  * | |||
|  * C is processed in column panels: first panels of GEMM_UNROLL_N columns, | |||
|  * then the leftover columns in panels of decreasing power-of-two width. | |||
|  * Inside a panel the rows are walked in blocks of sve_size rows (one SVE | |||
|  * vector of FLOAT lanes) followed by one block of m % sve_size rows. | |||
|  * For every block: | |||
|  *   1. if kk > 0, GEMM_KERNEL is called with alpha = dm1 (-1) and inner | |||
|  *      dimension kk on the start of the packed A block and B panel; | |||
|  *   2. solve() is applied to the block, reading A and B at offset kk. | |||
|  * kk starts at -offset and grows by the panel width after each panel. | |||
|  * | |||
|  *   m, n, k       : rows of C, columns of C, and the packed depth used to | |||
|  *                   step through a and b. | |||
|  *   dummy1/dummy2 : unused. | |||
|  *   a             : packed A; a block of r rows occupies r*k*COMPSIZE FLOATs. | |||
|  *   b             : packed B; a panel of w columns occupies w*k*COMPSIZE. | |||
|  *   c, ldc        : output matrix and its column stride (in elements). | |||
|  *   offset        : initial value of -kk. | |||
|  * Returns 0. | |||
|  */ | |||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, | |||
| #ifdef COMPLEX | |||
| FLOAT dummy2, | |||
| #endif | |||
| FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ | |||
|   FLOAT *aa, *cc; | |||
|   BLASLONG kk; | |||
|   BLASLONG i, j; | |||
|   /* number of FLOAT lanes in one SVE vector = row-block height */ | |||
| #ifdef DOUBLE | |||
|   int sve_size = svcntd(); | |||
| #else | |||
|   int sve_size = svcntw(); | |||
| #endif | |||
| #if 0 | |||
| fprintf(stderr, "TRSM RN KERNEL m = %3ld n = %3ld k = %3ld offset = %3ld\n", | |||
| m, n, k, offset); | |||
| #endif | |||
|   j = (n >> GEMM_UNROLL_N_SHIFT); | |||
|   kk = -offset; | |||
|   /* ---- full panels of GEMM_UNROLL_N columns ---- */ | |||
|   while (j > 0) { | |||
|     aa = a; | |||
|     cc = c; | |||
|     /* i is the row index just past the current full block */ | |||
|     i = sve_size; | |||
|     if (i <= m) { | |||
|       do { | |||
|         if (kk > 0) { | |||
|           GEMM_KERNEL(sve_size, GEMM_UNROLL_N, kk, dm1, | |||
| #ifdef COMPLEX | |||
|                       ZERO, | |||
| #endif | |||
|                       aa, b, cc, ldc); | |||
|         } | |||
|         solve(sve_size, GEMM_UNROLL_N, | |||
|               aa + kk * sve_size * COMPSIZE, | |||
|               b + kk * GEMM_UNROLL_N * COMPSIZE, | |||
|               cc, ldc); | |||
|         aa += sve_size * k * COMPSIZE; | |||
|         cc += sve_size * COMPSIZE; | |||
|         i += sve_size; | |||
|       } while (i <= m); | |||
|     } | |||
|     /* rows that do not fill a whole vector */ | |||
|     i = m % sve_size; | |||
|     if (i) { | |||
|       if (kk > 0) { | |||
|         GEMM_KERNEL(i, GEMM_UNROLL_N, kk, dm1, | |||
| #ifdef COMPLEX | |||
|                     ZERO, | |||
| #endif | |||
|                     aa, b, cc, ldc); | |||
|       } | |||
|       solve(i, GEMM_UNROLL_N, | |||
|             aa + kk * i * COMPSIZE, | |||
|             b + kk * GEMM_UNROLL_N * COMPSIZE, | |||
|             cc, ldc); | |||
|       aa += i * k * COMPSIZE; | |||
|       cc += i * COMPSIZE; | |||
|     } | |||
|     kk += GEMM_UNROLL_N; | |||
|     b += GEMM_UNROLL_N * k * COMPSIZE; | |||
|     c += GEMM_UNROLL_N * ldc * COMPSIZE; | |||
|     j --; | |||
|   } | |||
|   /* ---- leftover columns: one panel per set bit of n below GEMM_UNROLL_N, | |||
|    *      widest first ---- */ | |||
|   if (n & (GEMM_UNROLL_N - 1)) { | |||
|     j = (GEMM_UNROLL_N >> 1); | |||
|     while (j > 0) { | |||
|       if (n & j) { | |||
|         aa = a; | |||
|         cc = c; | |||
|         i = sve_size; | |||
|         while (i <= m) { | |||
|           if (kk > 0) { | |||
|             GEMM_KERNEL(sve_size, j, kk, dm1, | |||
| #ifdef COMPLEX | |||
|                         ZERO, | |||
| #endif | |||
|                         aa, | |||
|                         b, | |||
|                         cc, | |||
|                         ldc); | |||
|           } | |||
|           solve(sve_size, j, | |||
|                 aa + kk * sve_size * COMPSIZE, | |||
|                 b + kk * j * COMPSIZE, cc, ldc); | |||
|           aa += sve_size * k * COMPSIZE; | |||
|           cc += sve_size * COMPSIZE; | |||
|           i += sve_size; | |||
|         } | |||
|         i = m % sve_size; | |||
|         if (i) { | |||
|           if (kk > 0) { | |||
|             GEMM_KERNEL(i, j, kk, dm1, | |||
| #ifdef COMPLEX | |||
|                         ZERO, | |||
| #endif | |||
|                         aa, | |||
|                         b, | |||
|                         cc, | |||
|                         ldc); | |||
|           } | |||
|           solve(i, j, | |||
|                 aa + kk * i * COMPSIZE, | |||
|                 b + kk * j * COMPSIZE, cc, ldc); | |||
|           aa += i * k * COMPSIZE; | |||
|           cc += i * COMPSIZE; | |||
|         } | |||
|         b += j * k * COMPSIZE; | |||
|         c += j * ldc * COMPSIZE; | |||
|         kk += j; | |||
|       } | |||
|       j >>= 1; | |||
|     } | |||
|   } | |||
|   return 0; | |||
| } | |||
| @@ -0,0 +1,317 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #include "common.h" | |||
| #include "arm_sve.h" | |||
| static FLOAT dm1 = -1.; /* alpha value handed to every GEMM_KERNEL call in this file */ | |||
| #ifdef CONJ /* choose the GEMM micro-kernel flavour: _R when CONJ is defined, _N otherwise */ | |||
| #define GEMM_KERNEL GEMM_KERNEL_R | |||
| #else | |||
| #define GEMM_KERNEL GEMM_KERNEL_N | |||
| #endif | |||
| #if GEMM_DEFAULT_UNROLL_N == 1 /* GEMM_UNROLL_N_SHIFT is log2 of the N unroll; CNAME uses it as n >> SHIFT */ | |||
| #define GEMM_UNROLL_N_SHIFT 0 | |||
| #endif | |||
| #if GEMM_DEFAULT_UNROLL_N == 2 | |||
| #define GEMM_UNROLL_N_SHIFT 1 | |||
| #endif | |||
| #if GEMM_DEFAULT_UNROLL_N == 4 | |||
| #define GEMM_UNROLL_N_SHIFT 2 | |||
| #endif | |||
| #if GEMM_DEFAULT_UNROLL_N == 8 | |||
| #define GEMM_UNROLL_N_SHIFT 3 | |||
| #endif | |||
| #if GEMM_DEFAULT_UNROLL_N == 16 | |||
| #define GEMM_UNROLL_N_SHIFT 4 | |||
| #endif /* NOTE(review): any other unroll value leaves GEMM_UNROLL_N_SHIFT undefined */ | |||
| #ifndef COMPLEX | |||
| /* Real-valued triangular solve of one m x n tile of C, sweeping the n | |||
|  * columns in decreasing order (last column first). | |||
|  * | |||
|  *   m, n : height and width of the tile. | |||
|  *   a    : output buffer; solved column i is written to a[i*m .. i*m+m-1]. | |||
|  *   b    : n x n coefficient block, row i starting at b + i*n.  b[i*n + i] | |||
|  *          is used as a multiplier (presumably the reciprocal of the | |||
|  *          diagonal prepared by the packing routine - TODO confirm); the | |||
|  *          entries to its left drive the updates of the earlier columns. | |||
|  *   c    : tile of C with column stride ldc; overwritten with the solution. | |||
|  */ | |||
| static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { | |||
|   int col, row, prev; | |||
|   for (col = n - 1; col >= 0; col--) { | |||
|     FLOAT *b_row = b + col * n;     /* coefficients belonging to this column */ | |||
|     FLOAT *a_col = a + col * m;     /* where the solved column is stored     */ | |||
|     FLOAT *c_col = c + col * ldc;   /* the column of C being solved          */ | |||
|     FLOAT diag = b_row[col]; | |||
|     for (row = 0; row < m; row++) { | |||
|       FLOAT x = c_col[row] * diag; | |||
|       a_col[row] = x; | |||
|       c_col[row] = x; | |||
|       /* remove this element's contribution from every earlier column */ | |||
|       for (prev = 0; prev < col; prev++) { | |||
|         c[row + prev * ldc] -= x * b_row[prev]; | |||
|       } | |||
|     } | |||
|   } | |||
| } | |||
| #else | |||
| /* Complex-valued triangular solve of one m x n tile of C, sweeping the n | |||
|  * columns in decreasing order.  All arrays hold interleaved (re, im) pairs. | |||
|  * | |||
|  *   m, n : height and width of the tile, counted in complex elements. | |||
|  *   a    : output buffer; solved column i occupies a[2*i*m .. 2*i*m+2*m-1]. | |||
|  *   b    : n x n coefficient block, row i starting at b + 2*i*n.  The pair | |||
|  *          at position i of that row multiplies the column (presumably the | |||
|  *          pre-inverted diagonal - TODO confirm). | |||
|  *   c    : tile of C, column stride ldc complex elements; solved in place. | |||
|  * | |||
|  * When CONJ is defined the signs of the imaginary cross terms are flipped, | |||
|  * i.e. the coefficient is used conjugated. | |||
|  */ | |||
| static inline void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { | |||
|   int col, row, prev; | |||
|   ldc *= 2;   /* from here on ldc is a stride in FLOATs */ | |||
|   for (col = n - 1; col >= 0; col--) { | |||
|     FLOAT *b_row = b + col * n * 2; | |||
|     FLOAT *a_col = a + col * m * 2; | |||
|     FLOAT *c_col = c + col * ldc; | |||
|     FLOAT d_re = b_row[col * 2 + 0]; | |||
|     FLOAT d_im = b_row[col * 2 + 1]; | |||
|     for (row = 0; row < m; row++) { | |||
|       FLOAT c_re = c_col[row * 2 + 0]; | |||
|       FLOAT c_im = c_col[row * 2 + 1]; | |||
|       FLOAT x_re, x_im; | |||
| #ifndef CONJ | |||
|       x_re = c_re * d_re - c_im * d_im; | |||
|       x_im = c_re * d_im + c_im * d_re; | |||
| #else | |||
|       x_re = c_re * d_re + c_im * d_im; | |||
|       x_im = - c_re * d_im + c_im * d_re; | |||
| #endif | |||
|       a_col[row * 2 + 0] = x_re; | |||
|       a_col[row * 2 + 1] = x_im; | |||
|       c_col[row * 2 + 0] = x_re; | |||
|       c_col[row * 2 + 1] = x_im; | |||
|       /* remove this element's contribution from every earlier column */ | |||
|       for (prev = 0; prev < col; prev++) { | |||
|         FLOAT *c_prev = c + row * 2 + prev * ldc; | |||
| #ifndef CONJ | |||
|         c_prev[0] -= x_re * b_row[prev * 2 + 0] - x_im * b_row[prev * 2 + 1]; | |||
|         c_prev[1] -= x_re * b_row[prev * 2 + 1] + x_im * b_row[prev * 2 + 0]; | |||
| #else | |||
|         c_prev[0] -= x_re * b_row[prev * 2 + 0] + x_im * b_row[prev * 2 + 1]; | |||
|         c_prev[1] -= -x_re * b_row[prev * 2 + 1] + x_im * b_row[prev * 2 + 0]; | |||
| #endif | |||
|       } | |||
|     } | |||
|   } | |||
| } | |||
| #endif | |||
| /* TRSM micro-kernel driver (the disabled trace below labels it "TRSM RT"). | |||
|  * | |||
|  * The column panels of C are visited from the last one towards the first: | |||
|  * b and c are first moved past the end of the data and stepped back by one | |||
|  * panel before it is processed.  Leftover columns (n not a multiple of | |||
|  * GEMM_UNROLL_N) come first, in panels of increasing power-of-two width, | |||
|  * followed by the full GEMM_UNROLL_N-wide panels. | |||
|  * Inside a panel the rows are walked in blocks of sve_size rows (one SVE | |||
|  * vector of FLOAT lanes) followed by one block of m % sve_size rows. | |||
|  * For every block: | |||
|  *   1. if k - kk > 0, GEMM_KERNEL is called with alpha = dm1 (-1) and inner | |||
|  *      dimension k - kk on the parts of A and B that start at depth kk; | |||
|  *   2. solve() is applied, reading A and B at depth kk - panel width. | |||
|  * kk starts at n - offset and shrinks by the panel width after each panel. | |||
|  * | |||
|  *   dummy1/dummy2 : unused. | |||
|  *   a             : packed A; a block of r rows occupies r*k*COMPSIZE FLOATs. | |||
|  *   b             : packed B; a panel of w columns occupies w*k*COMPSIZE. | |||
|  *   c, ldc        : output matrix and its column stride (in elements). | |||
|  * Returns 0. | |||
|  */ | |||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, | |||
| #ifdef COMPLEX | |||
| FLOAT dummy2, | |||
| #endif | |||
| FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ | |||
|   BLASLONG row_end, width, panels, tail; | |||
|   FLOAT *a_blk, *c_blk; | |||
|   BLASLONG kk; | |||
|   /* number of FLOAT lanes in one SVE vector = row-block height */ | |||
| #ifdef DOUBLE | |||
|   int sve_size = svcntd(); | |||
| #else | |||
|   int sve_size = svcntw(); | |||
| #endif | |||
| #if 0 | |||
| fprintf(stderr, "TRSM RT KERNEL m = %3ld n = %3ld k = %3ld offset = %3ld\n", | |||
| m, n, k, offset); | |||
| #endif | |||
|   kk = n - offset; | |||
|   c += n * ldc * COMPSIZE; | |||
|   b += n * k * COMPSIZE; | |||
|   /* ---- leftover columns: one panel per set bit of n below GEMM_UNROLL_N, | |||
|    *      narrowest first ---- */ | |||
|   if (n & (GEMM_UNROLL_N - 1)) { | |||
|     for (width = 1; width < GEMM_UNROLL_N; width <<= 1) { | |||
|       if (!(n & width)) continue; | |||
|       a_blk = a; | |||
|       b -= width * k * COMPSIZE; | |||
|       c -= width * ldc* COMPSIZE; | |||
|       c_blk = c; | |||
|       /* row_end is the row index just past the current full block */ | |||
|       for (row_end = sve_size; row_end <= m; row_end += sve_size) { | |||
|         if (k - kk > 0) { | |||
|           GEMM_KERNEL(sve_size, width, k - kk, dm1, | |||
| #ifdef COMPLEX | |||
|                       ZERO, | |||
| #endif | |||
|                       a_blk + sve_size * kk * COMPSIZE, | |||
|                       b + width * kk * COMPSIZE, | |||
|                       c_blk, | |||
|                       ldc); | |||
|         } | |||
|         solve(sve_size, width, | |||
|               a_blk + (kk - width) * sve_size * COMPSIZE, | |||
|               b + (kk - width) * width * COMPSIZE, | |||
|               c_blk, ldc); | |||
|         a_blk += sve_size * k * COMPSIZE; | |||
|         c_blk += sve_size * COMPSIZE; | |||
|       } | |||
|       /* rows that do not fill a whole vector */ | |||
|       tail = m % sve_size; | |||
|       if (tail) { | |||
|         if (k - kk > 0) { | |||
|           GEMM_KERNEL(tail, width, k - kk, dm1, | |||
| #ifdef COMPLEX | |||
|                       ZERO, | |||
| #endif | |||
|                       a_blk + tail * kk * COMPSIZE, | |||
|                       b + width * kk * COMPSIZE, | |||
|                       c_blk, ldc); | |||
|         } | |||
|         solve(tail, width, | |||
|               a_blk + (kk - width) * tail * COMPSIZE, | |||
|               b + (kk - width) * width * COMPSIZE, | |||
|               c_blk, ldc); | |||
|         a_blk += tail * k * COMPSIZE; | |||
|         c_blk += tail * COMPSIZE; | |||
|       } | |||
|       kk -= width; | |||
|     } | |||
|   } | |||
|   /* ---- full panels of GEMM_UNROLL_N columns ---- */ | |||
|   for (panels = (n >> GEMM_UNROLL_N_SHIFT); panels > 0; panels--) { | |||
|     a_blk = a; | |||
|     b -= GEMM_UNROLL_N * k * COMPSIZE; | |||
|     c -= GEMM_UNROLL_N * ldc * COMPSIZE; | |||
|     c_blk = c; | |||
|     for (row_end = sve_size; row_end <= m; row_end += sve_size) { | |||
|       if (k - kk > 0) { | |||
|         GEMM_KERNEL(sve_size, GEMM_UNROLL_N, k - kk, dm1, | |||
| #ifdef COMPLEX | |||
|                     ZERO, | |||
| #endif | |||
|                     a_blk + sve_size * kk * COMPSIZE, | |||
|                     b + GEMM_UNROLL_N * kk * COMPSIZE, | |||
|                     c_blk, | |||
|                     ldc); | |||
|       } | |||
|       solve(sve_size, GEMM_UNROLL_N, | |||
|             a_blk + (kk - GEMM_UNROLL_N) * sve_size * COMPSIZE, | |||
|             b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE, | |||
|             c_blk, ldc); | |||
|       a_blk += sve_size * k * COMPSIZE; | |||
|       c_blk += sve_size * COMPSIZE; | |||
|     } | |||
|     tail = m % sve_size; | |||
|     if (tail) { | |||
|       if (k - kk > 0) { | |||
|         GEMM_KERNEL(tail, GEMM_UNROLL_N, k - kk, dm1, | |||
| #ifdef COMPLEX | |||
|                     ZERO, | |||
| #endif | |||
|                     a_blk + tail * kk * COMPSIZE, | |||
|                     b + GEMM_UNROLL_N * kk * COMPSIZE, | |||
|                     c_blk, | |||
|                     ldc); | |||
|       } | |||
|       solve(tail, GEMM_UNROLL_N, | |||
|             a_blk + (kk - GEMM_UNROLL_N) * tail * COMPSIZE, | |||
|             b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE, | |||
|             c_blk, ldc); | |||
|       a_blk += tail * k * COMPSIZE; | |||
|       c_blk += tail * COMPSIZE; | |||
|     } | |||
|     kk -= GEMM_UNROLL_N; | |||
|   } | |||
|   return 0; | |||
| } | |||
| @@ -0,0 +1,119 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| #include "arm_sve.h" | |||
| /* INV(x): value stored on the diagonal of a packed block - the reciprocal | |||
|  * of x, or simply ONE when UNIT is defined (x is then not evaluated). */ | |||
| #ifndef UNIT | |||
| #define INV(a) (ONE / (a)) | |||
| #else | |||
| #define INV(a) (ONE) | |||
| #endif | |||
| /* Packs a triangular operand into b, one panel of up to one SVE vector of | |||
|  * lanes at a time (n_active lanes; fewer in the last panel). | |||
|  * | |||
|  * Source layout for this variant: the n_active lanes of a panel are lda | |||
|  * elements apart (gather load with index 0, lda, 2*lda, ...) and successive | |||
|  * rows are one element apart. | |||
|  * | |||
|  * For each panel the m rows are walked with ii compared against jj, which | |||
|  * starts at `offset` and grows by n_active per panel: | |||
|  *   ii == jj : an n_active x n_active block is written; position (j, k) | |||
|  *              with k < j receives ao[k*lda + j], the diagonal receives | |||
|  *              INV(ao[j*lda + j]); positions with k > j are not written. | |||
|  *   ii >  jj : one row of n_active values is gathered and stored. | |||
|  *   ii <  jj : nothing is stored, but b still advances by n_active. | |||
|  * | |||
|  *   m, n   : number of rows walked per panel / total number of lanes. | |||
|  *   a, lda : source matrix and its leading dimension. | |||
|  *   offset : row index at which the first panel's diagonal block starts. | |||
|  *   b      : destination buffer. | |||
|  * Returns 0. | |||
|  */ | |||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ | |||
|   BLASLONG i, ii, jj; | |||
|   FLOAT *ao; | |||
|   /* Nothing to pack.  Without this guard n <= 0 gives n_active == 0, so the | |||
|    * diagonal branch would advance i by 0 and the row loop would never end; | |||
|    * m <= 0 would still run the do/while body once. */ | |||
|   if (m <= 0 || n <= 0) return 0; | |||
|   jj = offset; | |||
| #ifdef DOUBLE | |||
|   int64_t js = 0; | |||
|   svint64_t index = svindex_s64(0LL, lda); | |||
|   svbool_t pn = svwhilelt_b64(js, n); | |||
|   int n_active = svcntp_b64(svptrue_b64(), pn); | |||
| #else | |||
|   int32_t N = n; | |||
|   int32_t js = 0; | |||
|   svint32_t index = svindex_s32(0, lda); | |||
|   svbool_t pn = svwhilelt_b32(js, N); | |||
|   int n_active = svcntp_b32(svptrue_b32(), pn); | |||
| #endif | |||
|   do { | |||
|     ao = a; | |||
|     i = 0; | |||
|     ii = 0; | |||
|     do { | |||
|       if (ii == jj) { | |||
|         /* diagonal block */ | |||
|         for (int j = 0; j < n_active; j++) { | |||
|           for (int k = 0; k < j; k++) { | |||
|             *(b + j * n_active + k) = *(ao + k * lda + j); | |||
|           } | |||
|           *(b + j * n_active + j) = INV(*(ao + j * lda + j)); | |||
|         } | |||
|         ao += n_active; | |||
|         b += n_active * n_active; | |||
|         i += n_active; | |||
|         ii += n_active; | |||
|       } else { | |||
|         if (ii > jj) { | |||
| #ifdef DOUBLE | |||
|           svfloat64_t aj_vec = svld1_gather_index(pn, ao, index); | |||
| #else | |||
|           svfloat32_t aj_vec = svld1_gather_index(pn, ao, index); | |||
| #endif | |||
|           svst1(pn, b, aj_vec); | |||
|         } | |||
|         ao++; | |||
|         b += n_active; | |||
|         i++; | |||
|         ii++; | |||
|       } | |||
|     } while (i < m); | |||
|     /* next panel: recompute the lane predicate for the remaining lanes */ | |||
|     a += n_active * lda; | |||
|     jj += n_active; | |||
|     js += n_active; | |||
| #ifdef DOUBLE | |||
|     pn = svwhilelt_b64(js, n); | |||
|     n_active = svcntp_b64(svptrue_b64(), pn); | |||
|   } while (svptest_any(svptrue_b64(), pn)); | |||
| #else | |||
|     pn = svwhilelt_b32(js, N); | |||
|     n_active = svcntp_b32(svptrue_b32(), pn); | |||
|   } while (svptest_any(svptrue_b32(), pn)); | |||
| #endif | |||
|   return 0; | |||
| } | |||
| @@ -0,0 +1,117 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| #include "arm_sve.h" | |||
| #ifndef UNIT | |||
| #define INV(a) (ONE / (a)) | |||
| #else | |||
| #define INV(a) (ONE) | |||
| #endif | |||
| /* Packs a triangular operand into b, one panel of up to one SVE vector of | |||
|  * lanes at a time (n_active lanes; fewer in the last panel). | |||
|  * | |||
|  * Source layout for this variant: the n_active lanes of a panel are | |||
|  * contiguous in memory (plain svld1) and successive rows are lda elements | |||
|  * apart. | |||
|  * | |||
|  * For each panel the m rows are walked with ii compared against jj, which | |||
|  * starts at `offset` and grows by n_active per panel: | |||
|  *   ii == jj : an n_active x n_active block is written; the diagonal | |||
|  *              receives INV(ao[j*lda + j]) and position (j, k) with k > j | |||
|  *              receives ao[j*lda + k]; positions with k < j are not written. | |||
|  *   ii <  jj : one row of n_active contiguous values is copied. | |||
|  *   ii >  jj : nothing is stored, but b still advances by n_active. | |||
|  * | |||
|  *   m, n   : number of rows walked per panel / total number of lanes. | |||
|  *   a, lda : source matrix and its leading dimension. | |||
|  *   offset : row index at which the first panel's diagonal block starts. | |||
|  *   b      : destination buffer. | |||
|  * Returns 0. | |||
|  */ | |||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ | |||
|   BLASLONG i, ii, jj; | |||
|   FLOAT *ao; | |||
|   /* Nothing to pack.  Without this guard n <= 0 gives n_active == 0, so the | |||
|    * diagonal branch would advance i by 0 and the row loop would never end; | |||
|    * m <= 0 would still run the do/while body once. */ | |||
|   if (m <= 0 || n <= 0) return 0; | |||
|   jj = offset; | |||
| #ifdef DOUBLE | |||
|   int64_t js = 0; | |||
|   svbool_t pn = svwhilelt_b64(js, n); | |||
|   int n_active = svcntp_b64(svptrue_b64(), pn); | |||
| #else | |||
|   int32_t N = n; | |||
|   int32_t js = 0; | |||
|   svbool_t pn = svwhilelt_b32(js, N); | |||
|   int n_active = svcntp_b32(svptrue_b32(), pn); | |||
| #endif | |||
|   do { | |||
|     ao = a; | |||
|     i = 0; | |||
|     ii = 0; | |||
|     do { | |||
|       if (ii == jj) { | |||
|         /* diagonal block */ | |||
|         for (int j = 0; j < n_active; j++) { | |||
|           *(b + j * n_active + j) = INV(*(ao + j * lda + j)); | |||
|           for (int k = j+1; k < n_active; k++) { | |||
|             *(b + j * n_active + k) = *(ao + j * lda + k); | |||
|           } | |||
|         } | |||
|         b += n_active * n_active; | |||
|         ao += lda * n_active; | |||
|         i += n_active; | |||
|         ii += n_active; | |||
|       } else { | |||
|         if (ii < jj) { | |||
| #ifdef DOUBLE | |||
|           svfloat64_t aj_vec = svld1(pn, ao); | |||
| #else | |||
|           svfloat32_t aj_vec = svld1(pn, ao); | |||
| #endif | |||
|           svst1(pn, b, aj_vec); | |||
|         } | |||
|         ao += lda; | |||
|         b += n_active; | |||
|         i ++; | |||
|         ii ++; | |||
|       } | |||
|     } while (i < m); | |||
|     /* next panel: recompute the lane predicate for the remaining lanes */ | |||
|     a += n_active; | |||
|     jj += n_active; | |||
|     js += n_active; | |||
| #ifdef DOUBLE | |||
|     pn = svwhilelt_b64(js, n); | |||
|     n_active = svcntp_b64(svptrue_b64(), pn); | |||
|   } while (svptest_any(svptrue_b64(), pn)); | |||
| #else | |||
|     pn = svwhilelt_b32(js, N); | |||
|     n_active = svcntp_b32(svptrue_b32(), pn); | |||
|   } while (svptest_any(svptrue_b32(), pn)); | |||
| #endif | |||
|   return 0; | |||
| } | |||
| @@ -0,0 +1,119 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| #include "arm_sve.h" | |||
| #ifndef UNIT | |||
| #define INV(a) (ONE / (a)) | |||
| #else | |||
| #define INV(a) (ONE) | |||
| #endif | |||
| /* Packs a triangular operand into b, one panel of up to one SVE vector of | |||
|  * lanes at a time (n_active lanes; fewer in the last panel). | |||
|  * | |||
|  * Source layout for this variant: the n_active lanes of a panel are lda | |||
|  * elements apart (gather load with index 0, lda, 2*lda, ...) and successive | |||
|  * rows are one element apart. | |||
|  * | |||
|  * For each panel the m rows are walked with ii compared against jj, which | |||
|  * starts at `offset` and grows by n_active per panel: | |||
|  *   ii == jj : an n_active x n_active block is written; the diagonal | |||
|  *              receives INV(ao[j*lda + j]) and position (j, k) with k > j | |||
|  *              receives ao[k*lda + j]; positions with k < j are not written. | |||
|  *   ii <  jj : one row of n_active values is gathered and stored. | |||
|  *   ii >  jj : nothing is stored, but b still advances by n_active. | |||
|  * | |||
|  *   m, n   : number of rows walked per panel / total number of lanes. | |||
|  *   a, lda : source matrix and its leading dimension. | |||
|  *   offset : row index at which the first panel's diagonal block starts. | |||
|  *   b      : destination buffer. | |||
|  * Returns 0. | |||
|  */ | |||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ | |||
|   BLASLONG i, ii, jj; | |||
|   FLOAT *ao; | |||
|   /* Nothing to pack.  Without this guard n <= 0 gives n_active == 0, so the | |||
|    * diagonal branch would advance i by 0 and the row loop would never end; | |||
|    * m <= 0 would still run the do/while body once. */ | |||
|   if (m <= 0 || n <= 0) return 0; | |||
|   jj = offset; | |||
| #ifdef DOUBLE | |||
|   int64_t js = 0; | |||
|   svint64_t index = svindex_s64(0LL, lda); | |||
|   svbool_t pn = svwhilelt_b64(js, n); | |||
|   int n_active = svcntp_b64(svptrue_b64(), pn); | |||
| #else | |||
|   int32_t N = n; | |||
|   int32_t js = 0; | |||
|   svint32_t index = svindex_s32(0, lda); | |||
|   svbool_t pn = svwhilelt_b32(js, N); | |||
|   int n_active = svcntp_b32(svptrue_b32(), pn); | |||
| #endif | |||
|   do { | |||
|     ao = a; | |||
|     i = 0; | |||
|     ii = 0; | |||
|     do { | |||
|       if (ii == jj) { | |||
|         /* diagonal block */ | |||
|         for (int j = 0; j < n_active; j++) { | |||
|           *(b + j * n_active + j) = INV(*(ao + j * lda + j)); | |||
|           for (int k = j+1; k < n_active; k++) { | |||
|             *(b + j * n_active + k) = *(ao + k * lda + j); | |||
|           } | |||
|         } | |||
|         ao += n_active; | |||
|         b += n_active * n_active; | |||
|         i += n_active; | |||
|         ii += n_active; | |||
|       } else { | |||
|         if (ii < jj) { | |||
| #ifdef DOUBLE | |||
|           svfloat64_t aj_vec = svld1_gather_index(pn, ao, index); | |||
| #else | |||
|           svfloat32_t aj_vec = svld1_gather_index(pn, ao, index); | |||
| #endif | |||
|           svst1(pn, b, aj_vec); | |||
|         } | |||
|         ao++; | |||
|         b += n_active; | |||
|         i++; | |||
|         ii++; | |||
|       } | |||
|     } while (i < m); | |||
|     /* next panel: recompute the lane predicate for the remaining lanes */ | |||
|     a += n_active * lda; | |||
|     jj += n_active; | |||
|     js += n_active; | |||
| #ifdef DOUBLE | |||
|     pn = svwhilelt_b64(js, n); | |||
|     n_active = svcntp_b64(svptrue_b64(), pn); | |||
|   } while (svptest_any(svptrue_b64(), pn)); | |||
| #else | |||
|     pn = svwhilelt_b32(js, N); | |||
|     n_active = svcntp_b32(svptrue_b32(), pn); | |||
|   } while (svptest_any(svptrue_b32(), pn)); | |||
| #endif | |||
|   return 0; | |||
| } | |||
| @@ -0,0 +1,117 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| #include "arm_sve.h" | |||
| #ifndef UNIT | |||
| #define INV(a) (ONE / (a)) | |||
| #else | |||
| #define INV(a) (ONE) | |||
| #endif | |||
| /* Packs a triangular operand into b, one panel of up to one SVE vector of | |||
|  * lanes at a time (n_active lanes; fewer in the last panel). | |||
|  * | |||
|  * Source layout for this variant: the n_active lanes of a panel are | |||
|  * contiguous in memory (plain svld1) and successive rows are lda elements | |||
|  * apart. | |||
|  * | |||
|  * For each panel the m rows are walked with ii compared against jj, which | |||
|  * starts at `offset` and grows by n_active per panel: | |||
|  *   ii == jj : an n_active x n_active block is written; position (j, k) | |||
|  *              with k < j receives ao[j*lda + k], the diagonal receives | |||
|  *              INV(ao[j*lda + j]); positions with k > j are not written. | |||
|  *   ii >  jj : one row of n_active contiguous values is copied. | |||
|  *   ii <  jj : nothing is stored, but b still advances by n_active. | |||
|  * | |||
|  *   m, n   : number of rows walked per panel / total number of lanes. | |||
|  *   a, lda : source matrix and its leading dimension. | |||
|  *   offset : row index at which the first panel's diagonal block starts. | |||
|  *   b      : destination buffer. | |||
|  * Returns 0. | |||
|  */ | |||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ | |||
|   BLASLONG i, ii, jj; | |||
|   FLOAT *ao; | |||
|   /* Nothing to pack.  Without this guard n <= 0 gives n_active == 0, so the | |||
|    * diagonal branch would advance i by 0 and the row loop would never end; | |||
|    * m <= 0 would still run the do/while body once. */ | |||
|   if (m <= 0 || n <= 0) return 0; | |||
|   jj = offset; | |||
| #ifdef DOUBLE | |||
|   int64_t js = 0; | |||
|   svbool_t pn = svwhilelt_b64(js, n); | |||
|   int n_active = svcntp_b64(svptrue_b64(), pn); | |||
| #else | |||
|   int32_t N = n; | |||
|   int32_t js = 0; | |||
|   svbool_t pn = svwhilelt_b32(js, N); | |||
|   int n_active = svcntp_b32(svptrue_b32(), pn); | |||
| #endif | |||
|   do { | |||
|     ao = a; | |||
|     i = 0; | |||
|     ii = 0; | |||
|     do { | |||
|       if (ii == jj) { | |||
|         /* diagonal block */ | |||
|         for (int j = 0; j < n_active; j++) { | |||
|           for (int k = 0; k < j; k++) { | |||
|             *(b + j * n_active + k) = *(ao + j * lda + k); | |||
|           } | |||
|           *(b + j * n_active + j) = INV(*(ao + j * lda + j)); | |||
|         } | |||
|         ao += lda * n_active; | |||
|         b += n_active * n_active; | |||
|         i += n_active; | |||
|         ii += n_active; | |||
|       } else { | |||
|         if (ii > jj) { | |||
| #ifdef DOUBLE | |||
|           svfloat64_t aj_vec = svld1(pn, ao); | |||
| #else | |||
|           svfloat32_t aj_vec = svld1(pn, ao); | |||
| #endif | |||
|           svst1(pn, b, aj_vec); | |||
|         } | |||
|         ao += lda; | |||
|         b += n_active; | |||
|         i ++; | |||
|         ii ++; | |||
|       } | |||
|     } while (i < m); | |||
|     /* next panel: recompute the lane predicate for the remaining lanes */ | |||
|     a += n_active; | |||
|     jj += n_active; | |||
|     js += n_active; | |||
| #ifdef DOUBLE | |||
|     pn = svwhilelt_b64(js, n); | |||
|     n_active = svcntp_b64(svptrue_b64(), pn); | |||
|   } while (svptest_any(svptrue_b64(), pn)); | |||
| #else | |||
|     pn = svwhilelt_b32(js, N); | |||
|     n_active = svcntp_b32(svptrue_b32(), pn); | |||
|   } while (svptest_any(svptrue_b32(), pn)); | |||
| #endif | |||
|   return 0; | |||
| } | |||
| @@ -0,0 +1,874 @@ | |||
| /******************************************************************************* | |||
| Copyright (c) 2015, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *******************************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| /* X0 X1 X2 s0 X3 x4 x5 x6 */ | |||
| /*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc) */ | |||
| #define origM x0 | |||
| #define origN x1 | |||
| #define origK x2 | |||
| #define origPA x3 | |||
| #define origPB x4 | |||
| #define pC x5 | |||
| #define LDC x6 | |||
| #define temp x7 | |||
| #define counterL x8 | |||
| #define counterI x9 | |||
| #define counterJ x10 | |||
| #define pB x11 | |||
| #define pCRow0 x12 | |||
| #define pCRow1 x13 | |||
| #define pCRow2 x14 | |||
| #define pCRow3 x15 | |||
| #define pA x16 | |||
| #define lanes x17 | |||
| #define alphaR x19 | |||
| #define alphaI x20 | |||
| #define alphaz_R z6.d | |||
| #define alphaz_I z7.d | |||
| #define alpha0_R d6 | |||
| #define alpha0_I d7 | |||
| #define A_PRE_SIZE 2560 | |||
| #define B_PRE_SIZE 448 | |||
| #define C_PRE_SIZE 128 | |||
| #if defined(NN) || defined(NT) || defined(TN) || defined(TT) | |||
| #define OP_rr fmla | |||
| #define OP_ii fmls | |||
| #define OP_ri fmla | |||
| #define OP_ir fmla | |||
| #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) | |||
| #define OP_rr fmla | |||
| #define OP_ii fmla | |||
| #define OP_ri fmls | |||
| #define OP_ir fmla | |||
| #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) | |||
| #define OP_rr fmla | |||
| #define OP_ii fmla | |||
| #define OP_ri fmla | |||
| #define OP_ir fmls | |||
| #elif defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
| #define OP_rr fmla | |||
| #define OP_ii fmls | |||
| #define OP_ri fmls | |||
| #define OP_ir fmls | |||
| #endif | |||
| // 00 origM | |||
| // 01 origN | |||
| // 02 origK | |||
| // 03 origPA | |||
| // 04 origPB | |||
| // 05 pC | |||
| // 06 origLDC -> LDC | |||
| // 07 offset -> temp | |||
| // 08 counterL | |||
| // 09 counterI | |||
| // 10 counterJ | |||
| // 11 pB | |||
| // 12 pCRow0 | |||
| // 13 pCRow1 | |||
| // 14 pCRow2 | |||
| // 15 pCRow3 | |||
| // 16 pA | |||
| // 17 lanes | |||
| // 18 must save | |||
| // 19 must save alphaR | |||
| // 20 must save alphaI | |||
| // 21 must save | |||
| // 22 must save | |||
| // 23 must save | |||
| // 24 must save | |||
| // 25 must save | |||
| // 26 must save | |||
| // 27 must save | |||
| // 28 must save | |||
| // 29 frame | |||
| // 30 link | |||
| // 31 sp | |||
| //v00 ALPHA_R -> pA00_R, pA01_R | |||
| //v01 ALPHA_I -> pA00_I, pA01_I | |||
| //v02 pA02_R, pA03_R | |||
| //v03 pA02_I, pA03_I | |||
| //v04 pA10_R, pA11_R | |||
| //v05 pA10_I, pA11_I | |||
| //v06 pA12_R, pA13_R | |||
| //v07 pA12_I, pA13_I | |||
| //v08 must save pB00_R, pB01_R | |||
| //v09 must save pB00_I, pB01_I | |||
| //v10 must save pB02_R, pB03_R OR ALPHA0_R | |||
| //v11 must save pB02_I, pB03_I OR ALPHA0_I | |||
| //v12 must save pB10_R, pB11_R | |||
| //v13 must save pB10_I, pB11_I | |||
| //v14 must save pB12_R, pB13_R OR ALPHA1_R | |||
| //v15 must save pB12_I, pB13_I OR ALPHA1_I | |||
| //v16 pC0R | |||
| //v17 pC0I | |||
| //v18 pC1R | |||
| //v19 pC1I | |||
| //v20 pC2R | |||
| //v21 pC2I | |||
| //v22 pC3R | |||
| //v23 pC3I | |||
| //v24 pC3R | |||
| //v25 pC3I | |||
| //v26 pC22_R, pC23_R | |||
| //v27 pC22_I, pC23_I | |||
| //v28 pC30_R, pC31_R | |||
| //v29 pC30_I, pC31_I | |||
| //v30 pC32_R, pC33_R | |||
| //v31 pC32_I, pC33_I | |||
| /******************************************************************************* | |||
| * Macro definitions | |||
| *******************************************************************************/ | |||
| // Zero the eight accumulators z16-z23 used by the v1x4 kernels and SAVEv1x4. | |||
| .macro INITv1x4 | |||
| dup z16.d, #0 | |||
| dup z17.d, #0 | |||
| dup z18.d, #0 | |||
| dup z19.d, #0 | |||
| dup z20.d, #0 | |||
| dup z21.d, #0 | |||
| dup z22.d, #0 | |||
| dup z23.d, #0 | |||
| .endm | |||
| // First step of the software-pipelined v1x4 K loop. | |||
| // Loads two consecutive A vector pairs (z0/z1 for this step, z2/z3 for the next), | |||
| // loads eight broadcast B values into z8-z15, accumulates this step into z16-z23 | |||
| // and, while doing so, reloads z8-z15 with the B values of the next step. | |||
| // On exit pA has advanced by two steps and pB by 128 bytes. | |||
| // The sign of every partial product comes from the OP_rr/OP_ir/OP_ri/OP_ii macros, | |||
| // exactly as in KERNELv1x4_M1/M2/E/SUB. | |||
| .macro KERNELv1x4_I | |||
| ld2d {z0.d, z1.d}, p1/z, [pA] | |||
| add pA, pA, lanes, lsl #4 // pA += lanes*2*8 | |||
| ld2d {z2.d, z3.d}, p1/z, [pA] // next one | |||
| add pA, pA, lanes, lsl #4 // pA += lanes*2*8 | |||
| ld1rd z8.d, p0/z, [pB] | |||
| ld1rd z9.d, p0/z, [pB, 8] | |||
| ld1rd z10.d, p0/z, [pB, 16] | |||
| ld1rd z11.d, p0/z, [pB, 24] | |||
| ld1rd z12.d, p0/z, [pB, 32] | |||
| ld1rd z13.d, p0/z, [pB, 40] | |||
| ld1rd z14.d, p0/z, [pB, 48] | |||
| ld1rd z15.d, p0/z, [pB, 56] | |||
| add pB, pB, 64 | |||
| // Each B register is reloaded right after its last use in this step. | |||
| OP_rr z16.d, p1/m, z0.d, z8.d | |||
| OP_ir z17.d, p1/m, z1.d, z8.d | |||
| ld1rd z8.d, p0/z, [pB] | |||
| OP_ri z17.d, p1/m, z0.d, z9.d | |||
| OP_ii z16.d, p1/m, z1.d, z9.d | |||
| ld1rd z9.d, p0/z, [pB, 8] | |||
| OP_rr z18.d, p1/m, z0.d, z10.d | |||
| OP_ir z19.d, p1/m, z1.d, z10.d | |||
| ld1rd z10.d, p0/z, [pB, 16] | |||
| OP_ii z18.d, p1/m, z1.d, z11.d | |||
| OP_ri z19.d, p1/m, z0.d, z11.d | |||
| ld1rd z11.d, p0/z, [pB, 24] | |||
| OP_rr z20.d, p1/m, z0.d, z12.d | |||
| OP_ir z21.d, p1/m, z1.d, z12.d | |||
| ld1rd z12.d, p0/z, [pB, 32] | |||
| OP_ri z21.d, p1/m, z0.d, z13.d | |||
| OP_ii z20.d, p1/m, z1.d, z13.d | |||
| ld1rd z13.d, p0/z, [pB, 40] | |||
| OP_rr z22.d, p1/m, z0.d, z14.d | |||
| OP_ir z23.d, p1/m, z1.d, z14.d | |||
| ld1rd z14.d, p0/z, [pB, 48] | |||
| OP_ri z23.d, p1/m, z0.d, z15.d | |||
| OP_ii z22.d, p1/m, z1.d, z15.d | |||
| ld1rd z15.d, p0/z, [pB, 56] | |||
| add pB, pB, 64 | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] | |||
| .endm | |||
| // Pipelined v1x4 K step, phase 1: accumulates the A vectors already held in z0/z1 | |||
| // against the B values in z8-z15, while loading the next A vectors into z2/z3 and | |||
| // reloading z8-z15 from pB for the following step. Advances pA by one step and pB by 64 bytes. | |||
| .macro KERNELv1x4_M1 | |||
| ld2d {z2.d, z3.d}, p1/z, [pA] | |||
| add pA, pA, lanes, lsl #4 // pA = pA + lanes * 2 * 8 | |||
| OP_rr z16.d, p1/m, z0.d, z8.d | |||
| OP_ir z17.d, p1/m, z1.d, z8.d | |||
| ld1rd z8.d, p0/z, [pB] | |||
| OP_ii z16.d, p1/m, z1.d, z9.d | |||
| OP_ri z17.d, p1/m, z0.d, z9.d | |||
| ld1rd z9.d, p0/z, [pB, 8] | |||
| OP_rr z18.d, p1/m, z0.d, z10.d | |||
| OP_ir z19.d, p1/m, z1.d, z10.d | |||
| ld1rd z10.d, p0/z, [pB, 16] | |||
| OP_ii z18.d, p1/m, z1.d, z11.d | |||
| OP_ri z19.d, p1/m, z0.d, z11.d | |||
| ld1rd z11.d, p0/z, [pB, 24] | |||
| OP_rr z20.d, p1/m, z0.d, z12.d | |||
| OP_ir z21.d, p1/m, z1.d, z12.d | |||
| ld1rd z12.d, p0/z, [pB, 32] | |||
| OP_ii z20.d, p1/m, z1.d, z13.d | |||
| OP_ri z21.d, p1/m, z0.d, z13.d | |||
| ld1rd z13.d, p0/z, [pB, 40] | |||
| OP_rr z22.d, p1/m, z0.d, z14.d | |||
| OP_ir z23.d, p1/m, z1.d, z14.d | |||
| ld1rd z14.d, p0/z, [pB, 48] | |||
| OP_ii z22.d, p1/m, z1.d, z15.d | |||
| OP_ri z23.d, p1/m, z0.d, z15.d | |||
| ld1rd z15.d, p0/z, [pB, 56] | |||
| add pB, pB, 64 | |||
| // This phase prefetches ahead in A; KERNELv1x4_M2 prefetches ahead in B. | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] | |||
| .endm | |||
| // Pipelined v1x4 K step, phase 2: accumulates the A vectors already held in z2/z3 | |||
| // against the B values in z8-z15, while loading the next A vectors into z0/z1 and | |||
| // reloading z8-z15 from pB for the following step. Advances pA by one step and pB by 64 bytes. | |||
| .macro KERNELv1x4_M2 | |||
| ld2d {z0.d, z1.d}, p1/z, [pA] | |||
| add pA, pA, lanes, lsl #4 // pA = pA + lanes *2 * 8 | |||
| OP_rr z16.d, p1/m, z2.d, z8.d | |||
| OP_ir z17.d, p1/m, z3.d, z8.d | |||
| ld1rd z8.d, p0/z, [pB] | |||
| OP_ii z16.d, p1/m, z3.d, z9.d | |||
| OP_ri z17.d, p1/m, z2.d, z9.d | |||
| ld1rd z9.d, p0/z, [pB, 8] | |||
| OP_rr z18.d, p1/m, z2.d, z10.d | |||
| OP_ir z19.d, p1/m, z3.d, z10.d | |||
| ld1rd z10.d, p0/z, [pB, 16] | |||
| OP_ii z18.d, p1/m, z3.d, z11.d | |||
| OP_ri z19.d, p1/m, z2.d, z11.d | |||
| ld1rd z11.d, p0/z, [pB, 24] | |||
| OP_rr z20.d, p1/m, z2.d, z12.d | |||
| OP_ir z21.d, p1/m, z3.d, z12.d | |||
| ld1rd z12.d, p0/z, [pB, 32] | |||
| OP_ii z20.d, p1/m, z3.d, z13.d | |||
| OP_ri z21.d, p1/m, z2.d, z13.d | |||
| ld1rd z13.d, p0/z, [pB, 40] | |||
| OP_rr z22.d, p1/m, z2.d, z14.d | |||
| OP_ir z23.d, p1/m, z3.d, z14.d | |||
| ld1rd z14.d, p0/z, [pB, 48] | |||
| OP_ii z22.d, p1/m, z3.d, z15.d | |||
| OP_ri z23.d, p1/m, z2.d, z15.d | |||
| ld1rd z15.d, p0/z, [pB, 56] | |||
| // The first prefetch is issued before pB is advanced, the second one after. | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| add pB, pB, 64 | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] | |||
| .endm | |||
| // Last step of a pipelined v1x4 group: accumulates the A vectors in z2/z3 against | |||
| // the B values in z8-z15 without loading anything further, so pA and pB are unchanged. | |||
| .macro KERNELv1x4_E | |||
| OP_rr z16.d, p1/m, z2.d, z8.d | |||
| OP_ir z17.d, p1/m, z3.d, z8.d | |||
| OP_ii z16.d, p1/m, z3.d, z9.d | |||
| OP_ri z17.d, p1/m, z2.d, z9.d | |||
| OP_rr z18.d, p1/m, z2.d, z10.d | |||
| OP_ir z19.d, p1/m, z3.d, z10.d | |||
| OP_ii z18.d, p1/m, z3.d, z11.d | |||
| OP_ri z19.d, p1/m, z2.d, z11.d | |||
| OP_rr z20.d, p1/m, z2.d, z12.d | |||
| OP_ir z21.d, p1/m, z3.d, z12.d | |||
| OP_ii z20.d, p1/m, z3.d, z13.d | |||
| OP_ri z21.d, p1/m, z2.d, z13.d | |||
| OP_rr z22.d, p1/m, z2.d, z14.d | |||
| OP_ir z23.d, p1/m, z3.d, z14.d | |||
| OP_ii z22.d, p1/m, z3.d, z15.d | |||
| OP_ri z23.d, p1/m, z2.d, z15.d | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] | |||
| .endm | |||
| // Stand-alone (non-pipelined) v1x4 K step, used for the K % 8 remainder: | |||
| // loads one A vector pair into z0/z1 and eight broadcast B values into z8-z15, | |||
| // accumulates into z16-z23, and advances pA by one step and pB by 64 bytes. | |||
| .macro KERNELv1x4_SUB | |||
| ld2d {z0.d, z1.d}, p1/z, [pA] | |||
| add pA, pA, lanes, lsl #4 // pA = pA + lanes* 2 * 8 | |||
| ld1rd z8.d, p0/z, [pB] | |||
| ld1rd z9.d, p0/z, [pB, 8] | |||
| ld1rd z10.d, p0/z, [pB, 16] | |||
| ld1rd z11.d, p0/z, [pB, 24] | |||
| OP_rr z16.d, p1/m, z0.d, z8.d | |||
| OP_ir z17.d, p1/m, z1.d, z8.d | |||
| OP_ii z16.d, p1/m, z1.d, z9.d | |||
| OP_ri z17.d, p1/m, z0.d, z9.d | |||
| ld1rd z12.d, p0/z, [pB, 32] | |||
| ld1rd z13.d, p0/z, [pB, 40] | |||
| ld1rd z14.d, p0/z, [pB, 48] | |||
| ld1rd z15.d, p0/z, [pB, 56] | |||
| OP_rr z18.d, p1/m, z0.d, z10.d | |||
| OP_ir z19.d, p1/m, z1.d, z10.d | |||
| OP_ii z18.d, p1/m, z1.d, z11.d | |||
| OP_ri z19.d, p1/m, z0.d, z11.d | |||
| add pB, pB, 64 | |||
| OP_rr z20.d, p1/m, z0.d, z12.d | |||
| OP_ir z21.d, p1/m, z1.d, z12.d | |||
| OP_ii z20.d, p1/m, z1.d, z13.d | |||
| OP_ri z21.d, p1/m, z0.d, z13.d | |||
| OP_rr z22.d, p1/m, z0.d, z14.d | |||
| OP_ir z23.d, p1/m, z1.d, z14.d | |||
| OP_ii z22.d, p1/m, z1.d, z15.d | |||
| OP_ri z23.d, p1/m, z0.d, z15.d | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| .endm | |||
| // Write-back for a lanes x 4 tile. For each of the four C rows it loads the | |||
| // de-interleaved pair (first, second), then adds | |||
| //   first  += acc0 * alphaz_R - acc1 * alphaz_I | |||
| //   second += acc0 * alphaz_I + acc1 * alphaz_R | |||
| // where (acc0, acc1) is the accumulator pair z16/z17, z18/z19, z20/z21 or z22/z23, | |||
| // stores the pair back and advances the row pointer by lanes * 16 bytes. | |||
| // NOTE(review): the prefetch hints are uneven (pCRow3 is hinted twice, pCRow2 never); | |||
| // this does not affect results, confirm whether it is intentional. | |||
| .macro SAVEv1x4 | |||
| prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||
| ld2d {z24.d, z25.d}, p1/z, [pCRow0] | |||
| fmla z24.d, p1/m, z16.d, alphaz_R | |||
| fmls z24.d, p1/m, z17.d, alphaz_I | |||
| fmla z25.d, p1/m, z16.d, alphaz_I | |||
| fmla z25.d, p1/m, z17.d, alphaz_R | |||
| st2d {z24.d, z25.d}, p1, [pCRow0] | |||
| add pCRow0, pCRow0, lanes, lsl #4 | |||
| ld2d {z26.d, z27.d}, p1/z, [pCRow1] | |||
| fmla z26.d, p1/m, z18.d, alphaz_R | |||
| fmls z26.d, p1/m, z19.d, alphaz_I | |||
| fmla z27.d, p1/m, z18.d, alphaz_I | |||
| fmla z27.d, p1/m, z19.d, alphaz_R | |||
| st2d {z26.d, z27.d}, p1, [pCRow1] | |||
| add pCRow1, pCRow1, lanes, lsl #4 | |||
| prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||
| ld2d {z28.d, z29.d}, p1/z, [pCRow2] | |||
| fmla z28.d, p1/m, z20.d, alphaz_R | |||
| fmls z28.d, p1/m, z21.d, alphaz_I | |||
| fmla z29.d, p1/m, z20.d, alphaz_I | |||
| fmla z29.d, p1/m, z21.d, alphaz_R | |||
| st2d {z28.d, z29.d}, p1, [pCRow2] | |||
| add pCRow2, pCRow2, lanes, lsl #4 | |||
| ld2d {z30.d, z31.d}, p1/z, [pCRow3] | |||
| fmla z30.d, p1/m, z22.d, alphaz_R | |||
| fmls z30.d, p1/m, z23.d, alphaz_I | |||
| fmla z31.d, p1/m, z22.d, alphaz_I | |||
| fmla z31.d, p1/m, z23.d, alphaz_R | |||
| st2d {z30.d, z31.d}, p1, [pCRow3] | |||
| prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] | |||
| add pCRow3, pCRow3, lanes, lsl #4 // pC = pC + lanes * 2 *8 | |||
| prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] | |||
| .endm | |||
| /******************************************************************************/ | |||
| // Zero the four accumulators z16-z19 used by KERNELv1x2_SUB and SAVEv1x2. | |||
| .macro INITv1x2 | |||
| dup z16.d, #0 | |||
| dup z17.d, #0 | |||
| dup z18.d, #0 | |||
| dup z19.d, #0 | |||
| .endm | |||
| // One K step for a lanes x 2 tile: loads one A vector pair into z0/z1 and four | |||
| // broadcast B values into z8-z11, accumulates into z16-z19, and advances pA by | |||
| // one step and pB by 32 bytes. | |||
| .macro KERNELv1x2_SUB | |||
| ld2d {z0.d, z1.d}, p1/z, [pA] | |||
| add pA, pA, lanes, lsl #4 // pA = pA + lanes* 2 * 8 | |||
| ld1rd z8.d, p0/z, [pB] | |||
| ld1rd z9.d, p0/z, [pB, 8] | |||
| ld1rd z10.d, p0/z, [pB, 16] | |||
| ld1rd z11.d, p0/z, [pB, 24] | |||
| OP_rr z16.d, p1/m, z0.d, z8.d | |||
| OP_ir z17.d, p1/m, z1.d, z8.d | |||
| OP_ii z16.d, p1/m, z1.d, z9.d | |||
| OP_ri z17.d, p1/m, z0.d, z9.d | |||
| OP_rr z18.d, p1/m, z0.d, z10.d | |||
| OP_ir z19.d, p1/m, z1.d, z10.d | |||
| OP_ii z18.d, p1/m, z1.d, z11.d | |||
| OP_ri z19.d, p1/m, z0.d, z11.d | |||
| add pB, pB, 32 | |||
| .endm | |||
| // Write-back for a lanes x 2 tile. For each of the two C rows it loads the | |||
| // de-interleaved pair (first, second), then adds | |||
| //   first  += acc0 * alphaz_R - acc1 * alphaz_I | |||
| //   second += acc0 * alphaz_I + acc1 * alphaz_R | |||
| // where (acc0, acc1) is z16/z17 for pCRow0 and z18/z19 for pCRow1, stores the | |||
| // pair back and advances the row pointer by lanes * 16 bytes. | |||
| // Only pCRow0 and pCRow1 are prefetched: the other row pointers are not set up | |||
| // on the two-column path, so hinting through them would touch unrelated memory. | |||
| .macro SAVEv1x2 | |||
| prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||
| ld2d {z24.d, z25.d}, p1/z, [pCRow0] | |||
| fmla z24.d, p1/m, z16.d, alphaz_R | |||
| fmls z24.d, p1/m, z17.d, alphaz_I | |||
| fmla z25.d, p1/m, z16.d, alphaz_I | |||
| fmla z25.d, p1/m, z17.d, alphaz_R | |||
| st2d {z24.d, z25.d}, p1, [pCRow0] | |||
| add pCRow0, pCRow0, lanes, lsl #4 | |||
| ld2d {z26.d, z27.d}, p1/z, [pCRow1] | |||
| fmla z26.d, p1/m, z18.d, alphaz_R | |||
| fmls z26.d, p1/m, z19.d, alphaz_I | |||
| fmla z27.d, p1/m, z18.d, alphaz_I | |||
| fmla z27.d, p1/m, z19.d, alphaz_R | |||
| st2d {z26.d, z27.d}, p1, [pCRow1] | |||
| add pCRow1, pCRow1, lanes, lsl #4 | |||
| prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||
| .endm | |||
| /******************************************************************************/ | |||
| // Zero the two accumulators z16/z17 used by KERNELv1x1_SUB and SAVEv1x1. | |||
| .macro INITv1x1 | |||
| dup z16.d, #0 | |||
| dup z17.d, #0 | |||
| .endm | |||
| // One K step for a lanes x 1 tile: loads one A vector pair into z0/z1 and two | |||
| // broadcast B values into z8/z9, accumulates into z16/z17, and advances pA by | |||
| // one step and pB by 16 bytes. | |||
| .macro KERNELv1x1_SUB | |||
| ld2d {z0.d, z1.d}, p1/z, [pA] | |||
| add pA, pA, lanes, lsl #4 // pA = pA + lanes* 2 * 8 | |||
| ld1rd z8.d, p0/z, [pB] | |||
| ld1rd z9.d, p0/z, [pB, 8] | |||
| add pB, pB, 16 | |||
| OP_rr z16.d, p1/m, z0.d, z8.d | |||
| OP_ir z17.d, p1/m, z1.d, z8.d | |||
| OP_ii z16.d, p1/m, z1.d, z9.d | |||
| OP_ri z17.d, p1/m, z0.d, z9.d | |||
| .endm | |||
| // Write-back for a lanes x 1 tile. Loads the de-interleaved pair (z24, z25) from | |||
| // pCRow0, then adds | |||
| //   z24 += z16 * alphaz_R - z17 * alphaz_I | |||
| //   z25 += z16 * alphaz_I + z17 * alphaz_R | |||
| // stores the pair back and advances pCRow0 by lanes * 16 bytes. | |||
| // The trailing prefetch targets pCRow0, the only C row pointer that is set up | |||
| // on the single-column path. | |||
| .macro SAVEv1x1 | |||
| prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||
| ld2d {z24.d, z25.d}, p1/z, [pCRow0] | |||
| fmla z24.d, p1/m, z16.d, alphaz_R | |||
| fmls z24.d, p1/m, z17.d, alphaz_I | |||
| fmla z25.d, p1/m, z16.d, alphaz_I | |||
| fmla z25.d, p1/m, z17.d, alphaz_R | |||
| st2d {z24.d, z25.d}, p1, [pCRow0] | |||
| add pCRow0, pCRow0, lanes, lsl #4 // pC = pC + lanes * 2 *8 | |||
| prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||
| .endm | |||
| /******************************************************************************/ | |||
| /******************************************************************************* | |||
| * End of macro definitions | |||
| *******************************************************************************/ | |||
| // Kernel entry point. Walks N in blocks of 4, then 2, then 1 columns; inside each | |||
| // block walks M with an SVE predicate (p1) and K in groups of 8 plus a remainder, | |||
| // and finishes every tile with the matching SAVE macro. Always returns 0 in x0. | |||
| PROLOGUE | |||
| .align 5 | |||
| // Reserve 11 * 16 bytes of stack and spill d8-d17 and x18-x28; restored at .Lzgemm_kernel_L999. | |||
| add sp, sp, #-(11 * 16) | |||
| stp d8, d9, [sp, #(0 * 16)] | |||
| stp d10, d11, [sp, #(1 * 16)] | |||
| stp d12, d13, [sp, #(2 * 16)] | |||
| stp d14, d15, [sp, #(3 * 16)] | |||
| stp d16, d17, [sp, #(4 * 16)] | |||
| stp x18, x19, [sp, #(5 * 16)] | |||
| stp x20, x21, [sp, #(6 * 16)] | |||
| stp x22, x23, [sp, #(7 * 16)] | |||
| stp x24, x25, [sp, #(8 * 16)] | |||
| stp x26, x27, [sp, #(9 * 16)] | |||
| str x28, [sp, #(10 * 16)] | |||
| prfm PLDL1KEEP, [origPB] | |||
| prfm PLDL1KEEP, [origPA] | |||
| // Move the two alpha values from d0 and d1 through general registers and broadcast them to z6 and z7. | |||
| fmov alphaR, d0 | |||
| dup alphaz_R, alphaR | |||
| fmov alphaI, d1 | |||
| dup alphaz_I, alphaI | |||
| lsl LDC, LDC, #4 // ldc = ldc * 2 * 8 | |||
| ptrue p0.d // create true predicate | |||
| mov pB, origPB | |||
| // Loop over N | |||
| mov counterJ, origN | |||
| asr counterJ, counterJ, #2 // J = J / 4 | |||
| cmp counterJ, #0 | |||
| ble .Lzgemm_kernel_L2_BEGIN | |||
| /******************************************************************************/ | |||
| // N block of 4: set up four C row pointers LDC bytes apart and move pC past them. | |||
| .Lzgemm_kernel_L4_BEGIN: | |||
| mov pCRow0, pC | |||
| add pCRow1, pCRow0, LDC | |||
| add pCRow2, pCRow1, LDC | |||
| add pCRow3, pCRow2, LDC | |||
| add pC, pCRow3, LDC | |||
| mov pA, origPA // pA = start of A array | |||
| .Lzgemm_kernel_L4_Mv1_BEGIN: | |||
| /* Loop over M is done in an SVE fashion. This has the benefit of the last M%SVE_LEN iterations being done in a single sweep */ | |||
| mov counterI, #0 | |||
| whilelt p1.d, counterI, origM | |||
| cntp lanes, p0, p1.d // lanes contain number of active SVE lanes in M dimension | |||
| .align 5 | |||
| .Lzgemm_kernel_L4_Mv1_20: | |||
| mov pB, origPB | |||
| INITv1x4 // fill with zeros | |||
| // counterL = K / 8. With at least two groups of 8 the pipelined path below is used. | |||
| asr counterL , origK, #3 | |||
| cmp counterL , #2 | |||
| blt .Lzgemm_kernel_L4_Mv1_32 | |||
| // First group of 8 K steps; leaves the next A and B values preloaded for the following group. | |||
| KERNELv1x4_I | |||
| KERNELv1x4_M2 | |||
| KERNELv1x4_M1 | |||
| KERNELv1x4_M2 | |||
| KERNELv1x4_M1 | |||
| KERNELv1x4_M2 | |||
| KERNELv1x4_M1 | |||
| KERNELv1x4_M2 | |||
| subs counterL, counterL, #2 // subtract 2 | |||
| ble .Lzgemm_kernel_L4_Mv1_22a | |||
| .align 5 | |||
| // Middle groups of 8 K steps, one group per iteration. | |||
| .Lzgemm_kernel_L4_Mv1_22: | |||
| KERNELv1x4_M1 | |||
| KERNELv1x4_M2 | |||
| KERNELv1x4_M1 | |||
| KERNELv1x4_M2 | |||
| KERNELv1x4_M1 | |||
| KERNELv1x4_M2 | |||
| KERNELv1x4_M1 | |||
| KERNELv1x4_M2 | |||
| subs counterL, counterL, #1 | |||
| bgt .Lzgemm_kernel_L4_Mv1_22 | |||
| .align 5 | |||
| // Last group of 8 K steps; ends with KERNELv1x4_E so nothing is loaded past the group. | |||
| .Lzgemm_kernel_L4_Mv1_22a: | |||
| KERNELv1x4_M1 | |||
| KERNELv1x4_M2 | |||
| KERNELv1x4_M1 | |||
| KERNELv1x4_M2 | |||
| KERNELv1x4_M1 | |||
| KERNELv1x4_M2 | |||
| KERNELv1x4_M1 | |||
| KERNELv1x4_E | |||
| b .Lzgemm_kernel_L4_Mv1_44 | |||
| .align 5 | |||
| // Reached with counterL < 2: run a single self-contained group of 8 if bit 0 of counterL is set. | |||
| .Lzgemm_kernel_L4_Mv1_32: | |||
| tst counterL, #1 | |||
| ble .Lzgemm_kernel_L4_Mv1_40 | |||
| KERNELv1x4_I | |||
| KERNELv1x4_M2 | |||
| KERNELv1x4_M1 | |||
| KERNELv1x4_M2 | |||
| KERNELv1x4_M1 | |||
| KERNELv1x4_M2 | |||
| KERNELv1x4_M1 | |||
| KERNELv1x4_E | |||
| b .Lzgemm_kernel_L4_Mv1_44 | |||
| // No full group of 8: the accumulators are zeroed again (already zeroed at .Lzgemm_kernel_L4_Mv1_20). | |||
| .Lzgemm_kernel_L4_Mv1_40: | |||
| INITv1x4 | |||
| // Remainder: K % 8 single steps. | |||
| .Lzgemm_kernel_L4_Mv1_44: | |||
| ands counterL , origK, #7 | |||
| ble .Lzgemm_kernel_L4_Mv1_100 | |||
| .align 5 | |||
| .Lzgemm_kernel_L4_Mv1_46: | |||
| KERNELv1x4_SUB | |||
| subs counterL, counterL, #1 | |||
| bne .Lzgemm_kernel_L4_Mv1_46 | |||
| .Lzgemm_kernel_L4_Mv1_100: | |||
| prfm PLDL1KEEP, [pA] | |||
| prfm PLDL1KEEP, [pA, #64] | |||
| prfm PLDL1KEEP, [origPB] | |||
| SAVEv1x4 | |||
| // Next strip of M: incd advances counterI by one vector of doublewords, whilelt | |||
| // rebuilds p1 and sets the flags that b.any tests (cntp does not touch them). | |||
| .Lzgemm_kernel_L4_Mv1_END: | |||
| incd counterI | |||
| whilelt p1.d, counterI, origM //SVE instruction | |||
| cntp lanes, p0, p1.d // lanes contain number of active SVE lanes in M dimension | |||
| b.any .Lzgemm_kernel_L4_Mv1_20 | |||
| .Lzgemm_kernel_L4_END: | |||
| lsl temp, origK, #6 | |||
| add origPB, origPB, temp // B = B + K * 4 * 8 * 2 | |||
| subs counterJ, counterJ , #1 // j-- | |||
| bgt .Lzgemm_kernel_L4_BEGIN | |||
| /******************************************************************************/ | |||
| .Lzgemm_kernel_L2_BEGIN: // fewer than 4 columns left in N direction | |||
| mov counterJ , origN | |||
| tst counterJ , #3 | |||
| ble .Lzgemm_kernel_L999 | |||
| tst counterJ , #2 | |||
| ble .Lzgemm_kernel_L1_BEGIN | |||
| mov pCRow0, pC // pCRow0 = pC | |||
| add pCRow1, pCRow0, LDC | |||
| add pC,pC,LDC, lsl #1 | |||
| mov pA, origPA // pA = A | |||
| .Lzgemm_kernel_L2_Mv1_BEGIN: | |||
| mov counterI, #0 | |||
| whilelt p1.d, counterI, origM //SVE instruction | |||
| cntp lanes, p0, p1.d | |||
| .Lzgemm_kernel_L2_Mv1_20: | |||
| INITv1x2 | |||
| mov pB, origPB | |||
| asr counterL , origK, #3 // counterL = counterL / 8 | |||
| cmp counterL,#0 | |||
| ble .Lzgemm_kernel_L2_Mv1_40 | |||
| .align 5 | |||
| // Eight non-pipelined K steps per iteration. | |||
| .Lzgemm_kernel_L2_Mv1_22: | |||
| KERNELv1x2_SUB | |||
| KERNELv1x2_SUB | |||
| KERNELv1x2_SUB | |||
| KERNELv1x2_SUB | |||
| KERNELv1x2_SUB | |||
| KERNELv1x2_SUB | |||
| KERNELv1x2_SUB | |||
| KERNELv1x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt .Lzgemm_kernel_L2_Mv1_22 | |||
| .Lzgemm_kernel_L2_Mv1_40: | |||
| ands counterL , origK, #7 // counterL = counterL % 8 | |||
| ble .Lzgemm_kernel_L2_Mv1_100 | |||
| .Lzgemm_kernel_L2_Mv1_42: | |||
| KERNELv1x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt .Lzgemm_kernel_L2_Mv1_42 | |||
| .Lzgemm_kernel_L2_Mv1_100: | |||
| SAVEv1x2 | |||
| .Lzgemm_kernel_L2_Mv1_END: | |||
| incd counterI | |||
| whilelt p1.d, counterI, origM //SVE instruction | |||
| cntp lanes, p0, p1.d | |||
| b.any .Lzgemm_kernel_L2_Mv1_20 | |||
| .Lzgemm_kernel_L2_END: | |||
| lsl temp, origK, #5 | |||
| add origPB, origPB, temp // B = B + K * 2 * 8 * 2 | |||
| /******************************************************************************/ | |||
| // Last single column, taken when bit 0 of N is set. | |||
| .Lzgemm_kernel_L1_BEGIN: | |||
| mov counterJ , origN | |||
| tst counterJ , #1 | |||
| ble .Lzgemm_kernel_L999 // done | |||
| mov pCRow0, pC // pCRow0 = C | |||
| add pC , pC , LDC // Update pC to point to next | |||
| mov pA, origPA // pA = A | |||
| .Lzgemm_kernel_L1_Mv1_BEGIN: | |||
| mov counterI, #0 | |||
| whilelt p1.d, counterI, origM //SVE instruction | |||
| cntp lanes, p0, p1.d | |||
| .Lzgemm_kernel_L1_Mv1_20: | |||
| INITv1x1 | |||
| mov pB, origPB | |||
| asr counterL , origK, #3 // counterL = counterL / 8 | |||
| cmp counterL , #0 | |||
| ble .Lzgemm_kernel_L1_Mv1_40 | |||
| .align 5 | |||
| // Eight non-pipelined K steps per iteration. | |||
| .Lzgemm_kernel_L1_Mv1_22: | |||
| KERNELv1x1_SUB | |||
| KERNELv1x1_SUB | |||
| KERNELv1x1_SUB | |||
| KERNELv1x1_SUB | |||
| KERNELv1x1_SUB | |||
| KERNELv1x1_SUB | |||
| KERNELv1x1_SUB | |||
| KERNELv1x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt .Lzgemm_kernel_L1_Mv1_22 | |||
| .Lzgemm_kernel_L1_Mv1_40: | |||
| ands counterL , origK, #7 // counterL = counterL % 8 | |||
| ble .Lzgemm_kernel_L1_Mv1_100 | |||
| .Lzgemm_kernel_L1_Mv1_42: | |||
| KERNELv1x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt .Lzgemm_kernel_L1_Mv1_42 | |||
| .Lzgemm_kernel_L1_Mv1_100: | |||
| SAVEv1x1 | |||
| .Lzgemm_kernel_L1_Mv1_END: | |||
| incd counterI | |||
| whilelt p1.d, counterI, origM //SVE instruction | |||
| cntp lanes, p0, p1.d | |||
| b.any .Lzgemm_kernel_L1_Mv1_20 | |||
| .Lzgemm_kernel_L1_END: | |||
| /******************************************************************************/ | |||
| // Common exit: return 0, reload every register spilled in the prologue and release the stack frame. | |||
| .Lzgemm_kernel_L999: | |||
| mov x0, #0 // set return value | |||
| ldp d8, d9, [sp, #(0 * 16)] | |||
| ldp d10, d11, [sp, #(1 * 16)] | |||
| ldp d12, d13, [sp, #(2 * 16)] | |||
| ldp d14, d15, [sp, #(3 * 16)] | |||
| ldp d16, d17, [sp, #(4 * 16)] | |||
| ldp x18, x19, [sp, #(5 * 16)] | |||
| ldp x20, x21, [sp, #(6 * 16)] | |||
| ldp x22, x23, [sp, #(7 * 16)] | |||
| ldp x24, x25, [sp, #(8 * 16)] | |||
| ldp x26, x27, [sp, #(9 * 16)] | |||
| ldr x28, [sp, #(10 * 16)] | |||
| add sp, sp, #(11*16) | |||
| ret | |||
| EPILOGUE | |||
| @@ -0,0 +1,79 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| #include <arm_sve.h> | |||
| // TODO: write in assembly with proper unrolling of inner loop | |||
| /* | |||
|  * Packs a into b one vector-wide panel at a time. | |||
|  * | |||
|  * A panel covers up to svcntd() consecutive values of the n index. For each of | |||
|  * the m steps inside a panel, the doubles at offsets 0 and 1 from the current | |||
|  * source position are gathered with a stride of lda * 2 doubles between lanes, | |||
|  * and written to b as interleaved pairs (svst2). b advances by 2 * active | |||
|  * doubles per step, where active is the number of lanes enabled for the panel. | |||
|  * | |||
|  * Always returns 0. | |||
|  */ | |||
| int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ | |||
|     svint64_t lane_stride = svindex_s64(0LL, lda * 2); | |||
|     IFLOAT *panel_src = a; | |||
|     IFLOAT *dst = b; | |||
|     BLASLONG col = 0; | |||
|     svbool_t pg = svwhilelt_b64(col, n); | |||
|     uint64_t active = svcntp_b64(svptrue_b64(), pg); | |||
|  | |||
|     do { | |||
|         IFLOAT *src = panel_src; | |||
|  | |||
|         for (uint64_t remaining = m; remaining != 0; remaining--) { | |||
|             double *first = (double *) src; | |||
|             svfloat64_t even_part = svld1_gather_index(pg, first, lane_stride); | |||
|             svfloat64_t odd_part = svld1_gather_index(pg, first + 1, lane_stride); | |||
|  | |||
|             svst2_f64(pg, (double *) dst, svcreate2(even_part, odd_part)); | |||
|             src += 2; | |||
|             dst += active * 2; | |||
|         } | |||
|  | |||
|         /* Step over the lanes just packed and rebuild the predicate for the next panel. */ | |||
|         panel_src += active * lda * 2; | |||
|         col += svcntd(); | |||
|         pg = svwhilelt_b64(col, n); | |||
|         active = svcntp_b64(svptrue_b64(), pg); | |||
|     } while (svptest_any(svptrue_b64(), pg)); | |||
|  | |||
|     return 0; | |||
| } | |||
| @@ -0,0 +1,75 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| #include <arm_sve.h> | |||
| // TODO: write in assembly with proper unrolling of inner loop | |||
| /* | |||
|  * Packs a into b one vector-wide panel at a time. | |||
|  * | |||
|  * A panel covers up to svcntd() consecutive values of the n index, which are | |||
|  * contiguous pairs of doubles in a. For each of the m steps inside a panel, | |||
|  * one predicated svld2 reads the pairs and one svst2 writes them to b; the | |||
|  * source then moves by lda * 2 doubles and b by 2 * active doubles, where | |||
|  * active is the number of lanes enabled for the panel. | |||
|  * | |||
|  * Always returns 0. | |||
|  */ | |||
| int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ | |||
|     IFLOAT *panel_src = a; | |||
|     IFLOAT *dst = b; | |||
|     BLASLONG col = 0; | |||
|     svbool_t pg = svwhilelt_b64(col, n); | |||
|     uint64_t active = svcntp_b64(svptrue_b64(), pg); | |||
|  | |||
|     do { | |||
|         IFLOAT *src = panel_src; | |||
|  | |||
|         for (uint64_t remaining = m; remaining != 0; remaining--) { | |||
|             svfloat64x2_t pairs = svld2(pg, (double *) src); | |||
|  | |||
|             svst2_f64(pg, (double *) dst, pairs); | |||
|             src += lda * 2; | |||
|             dst += active * 2; | |||
|         } | |||
|  | |||
|         /* Step over the lanes just packed and rebuild the predicate for the next panel. */ | |||
|         panel_src += active * 2; | |||
|         col += svcntd(); | |||
|         pg = svwhilelt_b64(col, n); | |||
|         active = svcntp_b64(svptrue_b64(), pg); | |||
|     } while (svptest_any(svptrue_b64(), pg)); | |||
|  | |||
|     return 0; | |||
| } | |||
| @@ -0,0 +1,172 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| #include <arm_sve.h> | |||
| /* | |||
|  * Copies complex values gathered from `a` into `b` as interleaved | |||
|  * (real, imag) pairs.  The n lanes are covered one SVE vector at a time; | |||
|  * for every vector, m inner steps each store one pair per active lane and | |||
|  * advance `b` by twice the number of active lanes. | |||
|  * | |||
|  * lda is doubled on entry and then used as an index stride counted in | |||
|  * FLOATs.  With d = (posX + lane) - posY - step: | |||
|  *   - the start index of a lane is (posX + lane) * 2 + posY * lda when | |||
|  *     d > 0 at step 0, otherwise (posX + lane) * lda + posY * 2; | |||
|  *   - after each step the index grows by lda while d > 0, by 2 otherwise; | |||
|  *   - the imaginary part is negated when d < 0 and replaced by ZERO when | |||
|  *     d == 0; it is stored as loaded when d > 0. | |||
|  * | |||
|  * NOTE(review): presumably the HEMM packing routine for lower storage; the | |||
|  * file name is outside this view, confirm. | |||
|  * NOTE(review): the single-precision path keeps n and every gather index in | |||
|  * 32-bit integers; presumably callers keep them in range, confirm. | |||
|  * | |||
|  * Always returns 0. | |||
|  */ | |||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ | |||
| #if defined(DOUBLE) | |||
|     BLASLONG diag, step; | |||
|     lda += lda; | |||
|     uint64_t vl = svcntd(); | |||
|     svint64_t posY_bc = svdup_s64(posY); | |||
|     svint64_t posX_bc = svdup_s64(posX); | |||
|     svint64_t lda_bc = svdup_s64(lda); | |||
|     svint64_t ones = svdup_s64(1LL); | |||
|     int64_t col = 0; | |||
|     svbool_t live = svwhilelt_b64(col, n); | |||
|     int64_t n_live = svcntp_b64(svptrue_b64(), live); | |||
|     svint64_t lane_neg = svindex_s64(0LL, -1LL);   /* 0, -1, -2, ... */ | |||
|     svint64_t lane_id = svindex_s64(0LL, 1LL);     /* 0,  1,  2, ... */ | |||
|     do { | |||
|         diag = posX - posY; | |||
|         svint64_t diag_bc = svdup_s64(diag); | |||
|         /* is_pos[lane] <=> diag + lane > 0 */ | |||
|         svbool_t is_pos = svcmpgt(live, diag_bc, lane_neg); | |||
|         svint64_t col_idx = svadd_z(live, posX_bc, lane_id); | |||
|         svint64_t idx_pos = svmla_z(live, svmul_z(live, col_idx, 2), posY_bc, lda_bc); | |||
|         svint64_t idx_rest = svmla_z(live, svmul_z(live, col_idx, lda_bc), posY_bc, 2); | |||
|         svint64_t gidx = svsel(is_pos, idx_pos, idx_rest); | |||
|         for (step = m; step > 0; step--) { | |||
|             svfloat64_t re = svld1_gather_index(live, a, gidx); | |||
|             svfloat64_t im = svld1_gather_index(live, a+1, gidx); | |||
|             if (diag <= 0) { | |||
|                 /* lanes with diag + lane < 0 get their imaginary part flipped */ | |||
|                 svbool_t is_neg = svwhilelt_b64(diag, 0LL); | |||
|                 im = svneg_m(im, is_neg, im); | |||
|             } | |||
|             svst2(live, b, svcreate2(re, im)); | |||
|             /* the lane with diag + lane == 0 gets a zero imaginary part */ | |||
|             if (diag < 1 && diag > -n_live) | |||
|                 b[ -2*diag + 1 ] = ZERO; | |||
|             b += n_live * 2; | |||
|             /* advance indices with the sign mask of the step just stored */ | |||
|             gidx = svadd_m(is_pos, gidx, lda_bc); | |||
|             gidx = svadd_m(svnot_z(live, is_pos), gidx, 2); | |||
|             diag--; | |||
|             diag_bc = svsub_z(live, diag_bc, ones); | |||
|             is_pos = svcmpgt(live, diag_bc, lane_neg); | |||
|         } | |||
|         posX += vl; | |||
|         posX_bc = svdup_s64(posX); | |||
|         col += vl; | |||
|         live = svwhilelt_b64(col, n); | |||
|         n_live = svcntp_b64(svptrue_b64(), live); | |||
|     } while (svptest_any(svptrue_b64(), live)); | |||
| #else | |||
|     int diag, step; | |||
|     lda += lda; | |||
|     uint32_t vl = svcntw(); | |||
|     svint32_t posY_bc = svdup_s32(posY); | |||
|     svint32_t posX_bc = svdup_s32(posX); | |||
|     svint32_t lda_bc = svdup_s32(lda); | |||
|     svint32_t ones = svdup_s32(1); | |||
|     int32_t col = 0; | |||
|     int32_t N = n; | |||
|     svbool_t live = svwhilelt_b32(col, N); | |||
|     int32_t n_live = svcntp_b32(svptrue_b32(), live); | |||
|     svint32_t lane_neg = svindex_s32(0, -1);   /* 0, -1, -2, ... */ | |||
|     svint32_t lane_id = svindex_s32(0, 1);     /* 0,  1,  2, ... */ | |||
|     do { | |||
|         diag = posX - posY; | |||
|         svint32_t diag_bc = svdup_s32(diag); | |||
|         /* is_pos[lane] <=> diag + lane > 0 */ | |||
|         svbool_t is_pos = svcmpgt(live, diag_bc, lane_neg); | |||
|         svint32_t col_idx = svadd_z(live, posX_bc, lane_id); | |||
|         svint32_t idx_pos = svmla_z(live, svmul_z(live, col_idx, 2), posY_bc, lda_bc); | |||
|         svint32_t idx_rest = svmla_z(live, svmul_z(live, col_idx, lda_bc), posY_bc, 2); | |||
|         svint32_t gidx = svsel(is_pos, idx_pos, idx_rest); | |||
|         for (step = m; step > 0; step--) { | |||
|             svfloat32_t re = svld1_gather_index(live, a, gidx); | |||
|             svfloat32_t im = svld1_gather_index(live, a+1, gidx); | |||
|             if (diag <= 0) { | |||
|                 /* lanes with diag + lane < 0 get their imaginary part flipped */ | |||
|                 svbool_t is_neg = svwhilelt_b32(diag, 0); | |||
|                 im = svneg_m(im, is_neg, im); | |||
|             } | |||
|             svst2(live, b, svcreate2(re, im)); | |||
|             /* the lane with diag + lane == 0 gets a zero imaginary part */ | |||
|             if (diag < 1 && diag > -n_live) | |||
|                 b[ -2*diag + 1 ] = ZERO; | |||
|             b += n_live * 2; | |||
|             /* advance indices with the sign mask of the step just stored */ | |||
|             gidx = svadd_m(is_pos, gidx, lda_bc); | |||
|             gidx = svadd_m(svnot_z(live, is_pos), gidx, 2); | |||
|             diag--; | |||
|             diag_bc = svsub_z(live, diag_bc, ones); | |||
|             is_pos = svcmpgt(live, diag_bc, lane_neg); | |||
|         } | |||
|         posX += vl; | |||
|         posX_bc = svdup_s32(posX); | |||
|         col += vl; | |||
|         live = svwhilelt_b32(col, N); | |||
|         n_live = svcntp_b32(svptrue_b32(), live); | |||
|     } while (svptest_any(svptrue_b32(), live)); | |||
| #endif | |||
|     return 0; | |||
| } | |||
| @@ -0,0 +1,172 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| #include <arm_sve.h> | |||
| /* | |||
|  * Copies complex values gathered from `a` into `b` as interleaved | |||
|  * (real, imag) pairs.  The n lanes are covered one SVE vector at a time; | |||
|  * for every vector, m inner steps each store one pair per active lane and | |||
|  * advance `b` by twice the number of active lanes. | |||
|  * | |||
|  * lda is doubled on entry and then used as an index stride counted in | |||
|  * FLOATs.  With d = (posX + lane) - posY - step: | |||
|  *   - the start index of a lane is (posX + lane) * lda + posY * 2 when | |||
|  *     d > 0 at step 0, otherwise (posX + lane) * 2 + posY * lda; | |||
|  *   - after each step the index grows by 2 while d > 0, by lda otherwise; | |||
|  *   - every imaginary part is negated once, then negated again for lanes | |||
|  *     with d < 0, so it ends up negated only when d > 0; when d == 0 it is | |||
|  *     replaced by ZERO. | |||
|  * | |||
|  * NOTE(review): presumably the HEMM packing routine for upper storage; the | |||
|  * file name is outside this view, confirm. | |||
|  * NOTE(review): the single-precision path keeps n and every gather index in | |||
|  * 32-bit integers; presumably callers keep them in range, confirm. | |||
|  * | |||
|  * Always returns 0. | |||
|  */ | |||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ | |||
| #if defined(DOUBLE) | |||
|     BLASLONG diag, step; | |||
|     lda += lda; | |||
|     uint64_t vl = svcntd(); | |||
|     svint64_t posY_bc = svdup_s64(posY); | |||
|     svint64_t posX_bc = svdup_s64(posX); | |||
|     svint64_t lda_bc = svdup_s64(lda); | |||
|     svint64_t ones = svdup_s64(1LL); | |||
|     int64_t col = 0; | |||
|     svbool_t live = svwhilelt_b64(col, n); | |||
|     int64_t n_live = svcntp_b64(svptrue_b64(), live); | |||
|     svint64_t lane_neg = svindex_s64(0LL, -1LL);   /* 0, -1, -2, ... */ | |||
|     svint64_t lane_id = svindex_s64(0LL, 1LL);     /* 0,  1,  2, ... */ | |||
|     do { | |||
|         diag = posX - posY; | |||
|         svint64_t diag_bc = svdup_s64(diag); | |||
|         /* is_pos[lane] <=> diag + lane > 0 */ | |||
|         svbool_t is_pos = svcmpgt(live, diag_bc, lane_neg); | |||
|         svint64_t col_idx = svadd_z(live, posX_bc, lane_id); | |||
|         svint64_t idx_pos = svmla_z(live, svmul_z(live, col_idx, lda), posY_bc, 2); | |||
|         svint64_t idx_rest = svmla_z(live, svmul_z(live, col_idx, 2), posY_bc, lda); | |||
|         svint64_t gidx = svsel(is_pos, idx_pos, idx_rest); | |||
|         for (step = m; step > 0; step--) { | |||
|             svfloat64_t re = svld1_gather_index(live, a, gidx); | |||
|             svfloat64_t im = svld1_gather_index(live, a+1, gidx); | |||
|             /* flip every lane first ... */ | |||
|             im = svneg_z(live, im); | |||
|             if (diag <= 0) { | |||
|                 /* ... then flip lanes with diag + lane < 0 back again */ | |||
|                 svbool_t is_neg = svwhilelt_b64(diag, 0LL); | |||
|                 im = svneg_m(im, is_neg, im); | |||
|             } | |||
|             svst2(live, b, svcreate2(re, im)); | |||
|             /* the lane with diag + lane == 0 gets a zero imaginary part */ | |||
|             if (diag < 1 && diag > -n_live) | |||
|                 b[ -2*diag + 1 ] = ZERO; | |||
|             b += n_live * 2; | |||
|             /* advance indices with the sign mask of the step just stored */ | |||
|             gidx = svadd_m(is_pos, gidx, 2); | |||
|             gidx = svadd_m(svnot_z(live, is_pos), gidx, lda_bc); | |||
|             diag--; | |||
|             diag_bc = svsub_z(live, diag_bc, ones); | |||
|             is_pos = svcmpgt(live, diag_bc, lane_neg); | |||
|         } | |||
|         posX += vl; | |||
|         posX_bc = svdup_s64(posX); | |||
|         col += vl; | |||
|         live = svwhilelt_b64(col, n); | |||
|         n_live = svcntp_b64(svptrue_b64(), live); | |||
|     } while (svptest_any(svptrue_b64(), live)); | |||
| #else | |||
|     int diag, step; | |||
|     lda += lda; | |||
|     uint32_t vl = svcntw(); | |||
|     svint32_t posY_bc = svdup_s32(posY); | |||
|     svint32_t posX_bc = svdup_s32(posX); | |||
|     svint32_t lda_bc = svdup_s32(lda); | |||
|     svint32_t ones = svdup_s32(1); | |||
|     int32_t col = 0; | |||
|     int32_t N = n; | |||
|     svbool_t live = svwhilelt_b32(col, N); | |||
|     int32_t n_live = svcntp_b32(svptrue_b32(), live); | |||
|     svint32_t lane_neg = svindex_s32(0, -1);   /* 0, -1, -2, ... */ | |||
|     svint32_t lane_id = svindex_s32(0, 1);     /* 0,  1,  2, ... */ | |||
|     do { | |||
|         diag = posX - posY; | |||
|         svint32_t diag_bc = svdup_s32(diag); | |||
|         /* is_pos[lane] <=> diag + lane > 0 */ | |||
|         svbool_t is_pos = svcmpgt(live, diag_bc, lane_neg); | |||
|         svint32_t col_idx = svadd_z(live, posX_bc, lane_id); | |||
|         svint32_t idx_pos = svmla_z(live, svmul_z(live, col_idx, lda), posY_bc, 2); | |||
|         svint32_t idx_rest = svmla_z(live, svmul_z(live, col_idx, 2), posY_bc, lda); | |||
|         svint32_t gidx = svsel(is_pos, idx_pos, idx_rest); | |||
|         for (step = m; step > 0; step--) { | |||
|             svfloat32_t re = svld1_gather_index(live, a, gidx); | |||
|             svfloat32_t im = svld1_gather_index(live, a+1, gidx); | |||
|             /* flip every lane first ... */ | |||
|             im = svneg_z(live, im); | |||
|             if (diag <= 0) { | |||
|                 /* ... then flip lanes with diag + lane < 0 back again */ | |||
|                 svbool_t is_neg = svwhilelt_b32(diag, 0); | |||
|                 im = svneg_m(im, is_neg, im); | |||
|             } | |||
|             svst2(live, b, svcreate2(re, im)); | |||
|             /* the lane with diag + lane == 0 gets a zero imaginary part */ | |||
|             if (diag < 1 && diag > -n_live) | |||
|                 b[ -2*diag + 1 ] = ZERO; | |||
|             b += n_live * 2; | |||
|             /* advance indices with the sign mask of the step just stored */ | |||
|             gidx = svadd_m(is_pos, gidx, 2); | |||
|             gidx = svadd_m(svnot_z(live, is_pos), gidx, lda_bc); | |||
|             diag--; | |||
|             diag_bc = svsub_z(live, diag_bc, ones); | |||
|             is_pos = svcmpgt(live, diag_bc, lane_neg); | |||
|         } | |||
|         posX += vl; | |||
|         posX_bc = svdup_s32(posX); | |||
|         col += vl; | |||
|         live = svwhilelt_b32(col, N); | |||
|         n_live = svcntp_b32(svptrue_b32(), live); | |||
|     } while (svptest_any(svptrue_b32(), live)); | |||
| #endif | |||
|     return 0; | |||
| } | |||
| @@ -0,0 +1,150 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| #include <arm_sve.h> | |||
| /* | |||
|  * Copies complex values gathered from `a` into `b` as interleaved | |||
|  * (real, imag) pairs, without altering them.  The n lanes are covered one | |||
|  * SVE vector at a time; for every vector, m inner steps each store one pair | |||
|  * per active lane and advance `b` by twice the number of active lanes. | |||
|  * | |||
|  * lda is doubled on entry and then used as an index stride counted in | |||
|  * FLOATs.  With d = (posX + lane) - posY - step: | |||
|  *   - the start index of a lane is (posX + lane) * 2 + posY * lda when | |||
|  *     d > 0 at step 0, otherwise (posX + lane) * lda + posY * 2; | |||
|  *   - after each step the index grows by lda while d > 0, by 2 otherwise. | |||
|  * | |||
|  * NOTE(review): presumably the SYMM packing routine for lower storage; the | |||
|  * file name is outside this view, confirm. | |||
|  * NOTE(review): the single-precision path keeps n and every gather index in | |||
|  * 32-bit integers; presumably callers keep them in range, confirm. | |||
|  * | |||
|  * Always returns 0. | |||
|  */ | |||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ | |||
|     BLASLONG step; | |||
|     lda += lda; | |||
| #if defined(DOUBLE) | |||
|     uint64_t vl = svcntd(); | |||
|     svint64_t posY_bc = svdup_s64(posY); | |||
|     svint64_t posX_bc = svdup_s64(posX); | |||
|     svint64_t lda_bc = svdup_s64(lda); | |||
|     svint64_t ones = svdup_s64(1LL); | |||
|     int64_t col = 0; | |||
|     svbool_t live = svwhilelt_b64(col, n); | |||
|     int64_t n_live = svcntp_b64(svptrue_b64(), live); | |||
|     svint64_t lane_neg = svindex_s64(0LL, -1LL);   /* 0, -1, -2, ... */ | |||
|     svint64_t lane_id = svindex_s64(0LL, 1LL);     /* 0,  1,  2, ... */ | |||
|     do { | |||
|         svint64_t diag_bc = svdup_s64(posX - posY); | |||
|         /* is_pos[lane] <=> (posX - posY - steps done) + lane > 0 */ | |||
|         svbool_t is_pos = svcmpgt(live, diag_bc, lane_neg); | |||
|         svint64_t col_idx = svadd_z(live, posX_bc, lane_id); | |||
|         svint64_t idx_pos = svmla_z(live, svmul_z(live, col_idx, 2), posY_bc, lda_bc); | |||
|         svint64_t idx_rest = svmla_z(live, svmul_z(live, col_idx, lda_bc), posY_bc, 2); | |||
|         svint64_t gidx = svsel(is_pos, idx_pos, idx_rest); | |||
|         for (step = m; step > 0; step--) { | |||
|             svfloat64_t re = svld1_gather_index(live, a, gidx); | |||
|             svfloat64_t im = svld1_gather_index(live, a+1, gidx); | |||
|             svst2(live, b, svcreate2(re, im)); | |||
|             b += n_live * 2; | |||
|             /* advance indices with the sign mask of the step just stored */ | |||
|             gidx = svadd_m(is_pos, gidx, lda_bc); | |||
|             gidx = svadd_m(svnot_z(live, is_pos), gidx, 2); | |||
|             diag_bc = svsub_z(live, diag_bc, ones); | |||
|             is_pos = svcmpgt(live, diag_bc, lane_neg); | |||
|         } | |||
|         posX += vl; | |||
|         posX_bc = svdup_s64(posX); | |||
|         col += vl; | |||
|         live = svwhilelt_b64(col, n); | |||
|         n_live = svcntp_b64(svptrue_b64(), live); | |||
|     } while (svptest_any(svptrue_b64(), live)); | |||
| #else | |||
|     uint32_t vl = svcntw(); | |||
|     svint32_t posY_bc = svdup_s32(posY); | |||
|     svint32_t posX_bc = svdup_s32(posX); | |||
|     svint32_t lda_bc = svdup_s32(lda); | |||
|     svint32_t ones = svdup_s32(1); | |||
|     int32_t N = n; | |||
|     int32_t col = 0; | |||
|     svbool_t live = svwhilelt_b32(col, N); | |||
|     int32_t n_live = svcntp_b32(svptrue_b32(), live); | |||
|     svint32_t lane_neg = svindex_s32(0, -1);   /* 0, -1, -2, ... */ | |||
|     svint32_t lane_id = svindex_s32(0, 1);     /* 0,  1,  2, ... */ | |||
|     do { | |||
|         svint32_t diag_bc = svdup_s32(posX - posY); | |||
|         /* is_pos[lane] <=> (posX - posY - steps done) + lane > 0 */ | |||
|         svbool_t is_pos = svcmpgt(live, diag_bc, lane_neg); | |||
|         svint32_t col_idx = svadd_z(live, posX_bc, lane_id); | |||
|         svint32_t idx_pos = svmla_z(live, svmul_z(live, col_idx, 2), posY_bc, lda_bc); | |||
|         svint32_t idx_rest = svmla_z(live, svmul_z(live, col_idx, lda_bc), posY_bc, 2); | |||
|         svint32_t gidx = svsel(is_pos, idx_pos, idx_rest); | |||
|         for (step = m; step > 0; step--) { | |||
|             svfloat32_t re = svld1_gather_index(live, a, gidx); | |||
|             svfloat32_t im = svld1_gather_index(live, a+1, gidx); | |||
|             svst2(live, b, svcreate2(re, im)); | |||
|             b += n_live * 2; | |||
|             /* advance indices with the sign mask of the step just stored */ | |||
|             gidx = svadd_m(is_pos, gidx, lda_bc); | |||
|             gidx = svadd_m(svnot_z(live, is_pos), gidx, 2); | |||
|             diag_bc = svsub_z(live, diag_bc, ones); | |||
|             is_pos = svcmpgt(live, diag_bc, lane_neg); | |||
|         } | |||
|         posX += vl; | |||
|         posX_bc = svdup_s32(posX); | |||
|         col += vl; | |||
|         live = svwhilelt_b32(col, N); | |||
|         n_live = svcntp_b32(svptrue_b32(), live); | |||
|     } while (svptest_any(svptrue_b32(), live)); | |||
| #endif | |||
|     return 0; | |||
| } | |||
| @@ -0,0 +1,150 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| #include <arm_sve.h> | |||
| /* | |||
|  * Copies complex values gathered from `a` into `b` as interleaved | |||
|  * (real, imag) pairs, without altering them.  The n lanes are covered one | |||
|  * SVE vector at a time; for every vector, m inner steps each store one pair | |||
|  * per active lane and advance `b` by twice the number of active lanes. | |||
|  * | |||
|  * lda is doubled on entry and then used as an index stride counted in | |||
|  * FLOATs.  With d = (posX + lane) - posY - step: | |||
|  *   - the start index of a lane is (posX + lane) * lda + posY * 2 when | |||
|  *     d > 0 at step 0, otherwise (posX + lane) * 2 + posY * lda; | |||
|  *   - after each step the index grows by 2 while d > 0, by lda otherwise. | |||
|  * | |||
|  * NOTE(review): presumably the SYMM packing routine for upper storage; the | |||
|  * file name is outside this view, confirm. | |||
|  * NOTE(review): the single-precision path keeps n and every gather index in | |||
|  * 32-bit integers; presumably callers keep them in range, confirm. | |||
|  * | |||
|  * Always returns 0. | |||
|  */ | |||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ | |||
|     BLASLONG step; | |||
|     lda += lda; | |||
| #if defined(DOUBLE) | |||
|     uint64_t vl = svcntd(); | |||
|     svint64_t posY_bc = svdup_s64(posY); | |||
|     svint64_t posX_bc = svdup_s64(posX); | |||
|     svint64_t lda_bc = svdup_s64(lda); | |||
|     svint64_t ones = svdup_s64(1LL); | |||
|     int64_t col = 0; | |||
|     svbool_t live = svwhilelt_b64(col, n); | |||
|     int64_t n_live = svcntp_b64(svptrue_b64(), live); | |||
|     svint64_t lane_neg = svindex_s64(0LL, -1LL);   /* 0, -1, -2, ... */ | |||
|     svint64_t lane_id = svindex_s64(0LL, 1LL);     /* 0,  1,  2, ... */ | |||
|     do { | |||
|         svint64_t diag_bc = svdup_s64(posX - posY); | |||
|         /* is_pos[lane] <=> (posX - posY - steps done) + lane > 0 */ | |||
|         svbool_t is_pos = svcmpgt(live, diag_bc, lane_neg); | |||
|         svint64_t col_idx = svadd_z(live, posX_bc, lane_id); | |||
|         svint64_t idx_pos = svmla_z(live, svmul_z(live, col_idx, lda_bc), posY_bc, 2); | |||
|         svint64_t idx_rest = svmla_z(live, svmul_z(live, col_idx, 2), posY_bc, lda); | |||
|         svint64_t gidx = svsel(is_pos, idx_pos, idx_rest); | |||
|         for (step = m; step > 0; step--) { | |||
|             svfloat64_t re = svld1_gather_index(live, a, gidx); | |||
|             svfloat64_t im = svld1_gather_index(live, a+1, gidx); | |||
|             svst2(live, b, svcreate2(re, im)); | |||
|             b += n_live * 2; | |||
|             /* advance indices with the sign mask of the step just stored */ | |||
|             gidx = svadd_m(is_pos, gidx, 2); | |||
|             gidx = svadd_m(svnot_z(live, is_pos), gidx, lda_bc); | |||
|             diag_bc = svsub_z(live, diag_bc, ones); | |||
|             is_pos = svcmpgt(live, diag_bc, lane_neg); | |||
|         } | |||
|         posX += vl; | |||
|         posX_bc = svdup_s64(posX); | |||
|         col += vl; | |||
|         live = svwhilelt_b64(col, n); | |||
|         n_live = svcntp_b64(svptrue_b64(), live); | |||
|     } while (svptest_any(svptrue_b64(), live)); | |||
| #else | |||
|     uint32_t vl = svcntw(); | |||
|     svint32_t posY_bc = svdup_s32(posY); | |||
|     svint32_t posX_bc = svdup_s32(posX); | |||
|     svint32_t lda_bc = svdup_s32(lda); | |||
|     svint32_t ones = svdup_s32(1); | |||
|     int32_t N = n; | |||
|     int32_t col = 0; | |||
|     svbool_t live = svwhilelt_b32(col, N); | |||
|     int32_t n_live = svcntp_b32(svptrue_b32(), live); | |||
|     svint32_t lane_neg = svindex_s32(0, -1);   /* 0, -1, -2, ... */ | |||
|     svint32_t lane_id = svindex_s32(0, 1);     /* 0,  1,  2, ... */ | |||
|     do { | |||
|         svint32_t diag_bc = svdup_s32(posX - posY); | |||
|         /* is_pos[lane] <=> (posX - posY - steps done) + lane > 0 */ | |||
|         svbool_t is_pos = svcmpgt(live, diag_bc, lane_neg); | |||
|         svint32_t col_idx = svadd_z(live, posX_bc, lane_id); | |||
|         svint32_t idx_pos = svmla_z(live, svmul_z(live, col_idx, lda_bc), posY_bc, 2); | |||
|         svint32_t idx_rest = svmla_z(live, svmul_z(live, col_idx, 2), posY_bc, lda); | |||
|         svint32_t gidx = svsel(is_pos, idx_pos, idx_rest); | |||
|         for (step = m; step > 0; step--) { | |||
|             svfloat32_t re = svld1_gather_index(live, a, gidx); | |||
|             svfloat32_t im = svld1_gather_index(live, a+1, gidx); | |||
|             svst2(live, b, svcreate2(re, im)); | |||
|             b += n_live * 2; | |||
|             /* advance indices with the sign mask of the step just stored */ | |||
|             gidx = svadd_m(is_pos, gidx, 2); | |||
|             gidx = svadd_m(svnot_z(live, is_pos), gidx, lda_bc); | |||
|             diag_bc = svsub_z(live, diag_bc, ones); | |||
|             is_pos = svcmpgt(live, diag_bc, lane_neg); | |||
|         } | |||
|         posX += vl; | |||
|         posX_bc = svdup_s32(posX); | |||
|         col += vl; | |||
|         live = svwhilelt_b32(col, N); | |||
|         n_live = svcntp_b32(svptrue_b32(), live); | |||
|     } while (svptest_any(svptrue_b32(), live)); | |||
| #endif | |||
|     return 0; | |||
| } | |||
| @@ -0,0 +1,145 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| #ifdef __ARM_FEATURE_SVE | |||
| #include <arm_sve.h> | |||
| #endif | |||
| /* | |||
|  * Packs complex data from `a` into `b` as interleaved (real, imag) pairs, | |||
|  * one group of up to one SVE vector of lanes (n_active) per outer pass. | |||
|  * lda is doubled on entry so that it counts FLOATs.  posY grows by n_active | |||
|  * after every outer pass. | |||
|  * | |||
|  * For each group, X starts at posX and ao at | |||
|  *   a + posY * 2 + posX * lda   when posX <= posY, | |||
|  *   a + posX * 2 + posY * lda   otherwise, | |||
|  * and the inner loop runs until m steps are consumed: | |||
|  *   X >  posY : gathers one complex value per active lane (lanes are lda | |||
|  *               FLOATs apart, starting at ao), stores them, ao += 2; | |||
|  *   X <  posY : stores nothing; ao += lda and b skips n_active pairs; | |||
|  *   X == posY : fills an n_active x n_active tile with scalar code.  Tile | |||
|  *               row j holds ao[k * lda + j * 2] for k < j (k <= j without | |||
|  *               UNIT), ONE + 0i at k == j under UNIT, and zeros for k > j; | |||
|  *               this consumes n_active steps at once. | |||
|  * | |||
|  * NOTE(review): presumably the TRMM lower / non-transposed packing routine; | |||
|  * the file name is outside this view, confirm. | |||
|  * | |||
|  * Returns 0.  Empty panels (m <= 0 or n <= 0) return immediately. | |||
|  */ | |||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ | |||
|     BLASLONG i, js; | |||
|     BLASLONG X; | |||
|     FLOAT *ao; | |||
|     /* Nothing to pack.  Without this guard n <= 0 gives n_active == 0, so the | |||
|        X == posY branch below never advances i and the inner loop cannot end; | |||
|        m <= 0 would still execute the do/while body once and touch a and b. */ | |||
|     if (m <= 0 || n <= 0) return 0; | |||
|     lda += lda; | |||
|     js = 0; | |||
| #ifdef DOUBLE | |||
|     svint64_t index = svindex_s64(0LL, lda);   /* 0, lda, 2*lda, ... */ | |||
|     svbool_t pn = svwhilelt_b64(js, n); | |||
|     int n_active = svcntp_b64(svptrue_b64(), pn); | |||
| #else | |||
|     svint32_t index = svindex_s32(0, lda);     /* 0, lda, 2*lda, ... */ | |||
|     svbool_t pn = svwhilelt_b32(js, n); | |||
|     int n_active = svcntp_b32(svptrue_b32(), pn); | |||
| #endif | |||
|     do | |||
|     { | |||
|         X = posX; | |||
|         if (posX <= posY) { | |||
|             ao = a + posY * 2 + posX * lda; | |||
|         } else { | |||
|             ao = a + posX * 2 + posY * lda; | |||
|         } | |||
|         i = 0; | |||
|         do | |||
|         { | |||
|             if (X > posY) { | |||
|                 /* strided gather: one complex value per active lane */ | |||
| #ifdef DOUBLE | |||
|                 svfloat64_t aj_vec_real = svld1_gather_index(pn, ao, index); | |||
|                 svfloat64_t aj_vec_imag = svld1_gather_index(pn, ao+1, index); | |||
| #else | |||
|                 svfloat32_t aj_vec_real = svld1_gather_index(pn, ao, index); | |||
|                 svfloat32_t aj_vec_imag = svld1_gather_index(pn, ao+1, index); | |||
| #endif | |||
|                 svst2(pn, b, svcreate2(aj_vec_real, aj_vec_imag)); | |||
|                 ao += 2; | |||
|                 b += n_active * 2; | |||
|                 X ++; | |||
|                 i ++; | |||
|             } else | |||
|             if (X < posY) { | |||
|                 /* skipped step: b is advanced but left unwritten */ | |||
|                 ao += lda; | |||
|                 b += n_active * 2; | |||
|                 X ++; | |||
|                 i ++; | |||
|             } else { | |||
|                 /* I did not find a way to unroll this while preserving vector-length-agnostic code. */ | |||
| #ifdef UNIT | |||
|                 int temp = 0; | |||
|                 for (int j = 0; j < n_active; j++) { | |||
|                     for (int k = 0 ; k < j; k++) { | |||
|                         b[temp++] = *(ao+k*lda+j*2); | |||
|                         b[temp++] = *(ao+k*lda+j*2+1); | |||
|                     } | |||
|                     /* diagonal entry is forced to ONE + 0i */ | |||
|                     b[temp++] = ONE; | |||
|                     b[temp++] = ZERO; | |||
|                     for (int k = j+1; k < n_active; k++) { | |||
|                         b[temp++] = ZERO; | |||
|                         b[temp++] = ZERO; | |||
|                     } | |||
|                 } | |||
| #else | |||
|                 int temp = 0; | |||
|                 for (int j = 0; j < n_active; j++) { | |||
|                     for (int k = 0 ; k <= j; k++) { | |||
|                         b[temp++] = *(ao+k*lda+j*2); | |||
|                         b[temp++] = *(ao+k*lda+j*2+1); | |||
|                     } | |||
|                     for (int k = j+1; k < n_active; k++) { | |||
|                         b[temp++] = ZERO; | |||
|                         b[temp++] = ZERO; | |||
|                     } | |||
|                 } | |||
| #endif | |||
|                 /* the tile accounts for n_active steps at once */ | |||
|                 ao += n_active * 2; | |||
|                 b += n_active*n_active * 2; | |||
|                 X += n_active; | |||
|                 i += n_active; | |||
|             } | |||
|         } while (i < m); | |||
|         posY += n_active; | |||
|         js += n_active; | |||
| #ifdef DOUBLE | |||
|         pn = svwhilelt_b64(js, n); | |||
|         n_active = svcntp_b64(svptrue_b64(), pn); | |||
|     } while (svptest_any(svptrue_b64(), pn)); | |||
| #else | |||
|         pn = svwhilelt_b32(js, n); | |||
|         n_active = svcntp_b32(svptrue_b32(), pn); | |||
|     } while (svptest_any(svptrue_b32(), pn)); | |||
| #endif | |||
|     return 0; | |||
| } | |||
| @@ -0,0 +1,143 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| #ifdef __ARM_FEATURE_SVE | |||
| #include <arm_sve.h> | |||
| #endif | |||
| /* | |||
|  * Packs complex data from `a` into `b` as interleaved (real, imag) pairs, | |||
|  * one group of up to one SVE vector of lanes (n_active) per outer pass. | |||
|  * lda is doubled on entry so that it counts FLOATs.  posY grows by n_active | |||
|  * after every outer pass. | |||
|  * | |||
|  * For each group, X starts at posX and ao at | |||
|  *   a + posY * 2 + posX * lda   when posX <= posY, | |||
|  *   a + posX * 2 + posY * lda   otherwise, | |||
|  * and the inner loop runs until m steps are consumed: | |||
|  *   X >  posY : stores nothing; ao += 2 and b skips n_active pairs; | |||
|  *   X <  posY : loads n_active consecutive complex values starting at ao | |||
|  *               with svld2, stores them, ao += lda; | |||
|  *   X == posY : fills an n_active x n_active tile with scalar code.  Tile | |||
|  *               row j holds zeros for k < j, ONE + 0i at k == j under UNIT, | |||
|  *               and ao[j * lda + k * 2] for k > j (k >= j without UNIT); | |||
|  *               this consumes n_active steps at once. | |||
|  * | |||
|  * NOTE(review): presumably the TRMM lower / transposed packing routine; the | |||
|  * file name is outside this view, confirm. | |||
|  * | |||
|  * Returns 0.  Empty panels (m <= 0 or n <= 0) return immediately. | |||
|  */ | |||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ | |||
|     BLASLONG i, js; | |||
|     BLASLONG X; | |||
|     FLOAT *ao; | |||
|     /* Nothing to pack.  Without this guard n <= 0 gives n_active == 0, so the | |||
|        X == posY branch below never advances i and the inner loop cannot end; | |||
|        m <= 0 would still execute the do/while body once and touch a and b. */ | |||
|     if (m <= 0 || n <= 0) return 0; | |||
|     lda += lda; | |||
|     js = 0; | |||
| #ifdef DOUBLE | |||
|     svbool_t pn = svwhilelt_b64(js, n); | |||
|     int n_active = svcntp_b64(svptrue_b64(), pn); | |||
| #else | |||
|     svbool_t pn = svwhilelt_b32(js, n); | |||
|     int n_active = svcntp_b32(svptrue_b32(), pn); | |||
| #endif | |||
|     do | |||
|     { | |||
|         X = posX; | |||
|         if (posX <= posY) { | |||
|             ao = a + posY * 2 + posX * lda; | |||
|         } else { | |||
|             ao = a + posX * 2 + posY * lda; | |||
|         } | |||
|         i = 0; | |||
|         do | |||
|         { | |||
|             if (X > posY) { | |||
|                 /* skipped step: b is advanced but left unwritten */ | |||
|                 ao += 2; | |||
|                 b += n_active * 2; | |||
|                 X ++; | |||
|                 i ++; | |||
|             } else | |||
|             if (X < posY) { | |||
|                 /* contiguous copy of n_active complex values */ | |||
| #ifdef DOUBLE | |||
|                 svfloat64x2_t aj_vec = svld2(pn, ao); | |||
| #else | |||
|                 svfloat32x2_t aj_vec = svld2(pn, ao); | |||
| #endif | |||
|                 svst2(pn, b, aj_vec); | |||
|                 ao += lda; | |||
|                 b += n_active * 2; | |||
|                 X ++; | |||
|                 i ++; | |||
|             } else { | |||
|                 /* I did not find a way to unroll this while preserving vector-length-agnostic code. */ | |||
| #ifdef UNIT | |||
|                 int temp = 0; | |||
|                 for (int j = 0; j < n_active; j++) { | |||
|                     for (int k = 0 ; k < j; k++) { | |||
|                         b[temp++] = ZERO; | |||
|                         b[temp++] = ZERO; | |||
|                     } | |||
|                     /* diagonal entry is forced to ONE + 0i */ | |||
|                     b[temp++] = ONE; | |||
|                     b[temp++] = ZERO; | |||
|                     for (int k = j+1; k < n_active; k++) { | |||
|                         b[temp++] = *(ao+j*lda+k*2); | |||
|                         b[temp++] = *(ao+j*lda+k*2+1); | |||
|                     } | |||
|                 } | |||
| #else | |||
|                 int temp = 0; | |||
|                 for (int j = 0; j < n_active; j++) { | |||
|                     for (int k = 0 ; k < j; k++) { | |||
|                         b[temp++] = ZERO; | |||
|                         b[temp++] = ZERO; | |||
|                     } | |||
|                     for (int k = j; k < n_active; k++) { | |||
|                         b[temp++] = *(ao+j*lda+k*2); | |||
|                         b[temp++] = *(ao+j*lda+k*2+1); | |||
|                     } | |||
|                 } | |||
| #endif | |||
|                 /* the tile accounts for n_active steps at once */ | |||
|                 ao += n_active * lda; | |||
|                 b += n_active*n_active * 2; | |||
|                 X += n_active; | |||
|                 i += n_active; | |||
|             } | |||
|         } while (i < m); | |||
|         posY += n_active; | |||
|         js += n_active; | |||
| #ifdef DOUBLE | |||
|         pn = svwhilelt_b64(js, n); | |||
|         n_active = svcntp_b64(svptrue_b64(), pn); | |||
|     } while (svptest_any(svptrue_b64(), pn)); | |||
| #else | |||
|         pn = svwhilelt_b32(js, n); | |||
|         n_active = svcntp_b32(svptrue_b32(), pn); | |||
|     } while (svptest_any(svptrue_b32(), pn)); | |||
| #endif | |||
|     return 0; | |||
| } | |||
| @@ -0,0 +1,145 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| #ifdef __ARM_FEATURE_SVE | |||
| #include <arm_sve.h> | |||
| #endif | |||
| /* | |||
|  * Copy strips of the interleaved (real, imaginary) matrix `a` into `b`. | |||
|  * A strip is `width` columns wide, where `width` is the number of lanes | |||
|  * enabled by the while-less-than predicate for the remaining columns. | |||
|  * For every position `pos` of a strip, compared against posY: | |||
|  *   pos <  posY : `width` elements spaced lda apart are gathered and | |||
|  *                 stored as interleaved pairs; | |||
|  *   pos >  posY : nothing is stored, only the pointers advance; | |||
|  *   pos == posY : a width x width tile is written element by element, | |||
|  *                 zero for r < c, and ONE + 0i on the diagonal when | |||
|  *                 UNIT is defined. | |||
|  * Always returns 0. | |||
|  */ | |||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ | |||
|     BLASLONG col, pos, done; | |||
|     FLOAT *src; | |||
|     int width, more; | |||
|     /* every element takes two FLOATs: real at src, imaginary at src + 1 */ | |||
|     lda *= 2; | |||
|     col = 0; | |||
| #ifdef DOUBLE | |||
|     svint64_t stride_idx = svindex_s64(0LL, lda); | |||
|     svbool_t pg = svwhilelt_b64(col, n); | |||
|     width = svcntp_b64(svptrue_b64(), pg); | |||
| #else | |||
|     svint32_t stride_idx = svindex_s32(0, lda); | |||
|     svbool_t pg = svwhilelt_b32(col, n); | |||
|     width = svcntp_b32(svptrue_b32(), pg); | |||
| #endif | |||
|     do { | |||
|         pos = posX; | |||
|         src = (posX <= posY) ? a + posX * 2 + posY * lda | |||
|                              : a + posY * 2 + posX * lda; | |||
|         done = 0; | |||
|         do { | |||
|             if (pos == posY) { | |||
|                 /* Scalar tile: no vector-length-agnostic form is used here. */ | |||
|                 for (int c = 0; c < width; c++) { | |||
|                     FLOAT *dst = b + 2 * c * width; | |||
|                     for (int r = 0; r < c; r++) { | |||
|                         dst[2 * r] = ZERO; | |||
|                         dst[2 * r + 1] = ZERO; | |||
|                     } | |||
| #ifdef UNIT | |||
|                     dst[2 * c] = ONE; | |||
|                     dst[2 * c + 1] = ZERO; | |||
| #else | |||
|                     dst[2 * c] = src[c * lda + 2 * c]; | |||
|                     dst[2 * c + 1] = src[c * lda + 2 * c + 1]; | |||
| #endif | |||
|                     for (int r = c + 1; r < width; r++) { | |||
|                         dst[2 * r] = src[r * lda + 2 * c]; | |||
|                         dst[2 * r + 1] = src[r * lda + 2 * c + 1]; | |||
|                     } | |||
|                 } | |||
|                 src += width * 2; | |||
|                 b += width * width * 2; | |||
|                 pos += width; | |||
|                 done += width; | |||
|             } else { | |||
|                 if (pos < posY) { | |||
|                     /* gather real and imaginary parts, store them interleaved */ | |||
|                     svst2(pg, b, svcreate2(svld1_gather_index(pg, src, stride_idx), | |||
|                                            svld1_gather_index(pg, src + 1, stride_idx))); | |||
|                     src += 2; | |||
|                 } else { | |||
|                     src += lda; | |||
|                 } | |||
|                 b += width * 2; | |||
|                 pos++; | |||
|                 done++; | |||
|             } | |||
|         } while (done < m); | |||
|         posY += width; | |||
|         col += width; | |||
|         /* predicate for the next strip; stop once no lane is enabled */ | |||
| #ifdef DOUBLE | |||
|         pg = svwhilelt_b64(col, n); | |||
|         width = svcntp_b64(svptrue_b64(), pg); | |||
|         more = svptest_any(svptrue_b64(), pg); | |||
| #else | |||
|         pg = svwhilelt_b32(col, n); | |||
|         width = svcntp_b32(svptrue_b32(), pg); | |||
|         more = svptest_any(svptrue_b32(), pg); | |||
| #endif | |||
|     } while (more); | |||
|     return 0; | |||
| } | |||
| @@ -0,0 +1,141 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| #ifdef __ARM_FEATURE_SVE | |||
| #include <arm_sve.h> | |||
| #endif | |||
| /* | |||
|  * Copy strips of the interleaved (real, imaginary) matrix `a` into `b`. | |||
|  * A strip is `width` columns wide, where `width` is the number of lanes | |||
|  * enabled by the while-less-than predicate for the remaining columns. | |||
|  * For every position `pos` of a strip, compared against posY: | |||
|  *   pos <  posY : nothing is stored, only the pointers advance; | |||
|  *   pos >  posY : `width` contiguous pairs are loaded and stored; | |||
|  *   pos == posY : a width x width tile is written element by element, | |||
|  *                 zero for r > c, and ONE + 0i on the diagonal when | |||
|  *                 UNIT is defined. | |||
|  * Always returns 0. | |||
|  */ | |||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){ | |||
|     BLASLONG col, pos, done; | |||
|     FLOAT *src; | |||
|     int width, more; | |||
|     /* every element takes two FLOATs, so the stride is doubled */ | |||
|     lda *= 2; | |||
|     col = 0; | |||
| #ifdef DOUBLE | |||
|     svbool_t pg = svwhilelt_b64(col, n); | |||
|     width = svcntp_b64(svptrue_b64(), pg); | |||
| #else | |||
|     svbool_t pg = svwhilelt_b32(col, n); | |||
|     width = svcntp_b32(svptrue_b32(), pg); | |||
| #endif | |||
|     do { | |||
|         pos = posX; | |||
|         src = (posX <= posY) ? a + posX * 2 + posY * lda | |||
|                              : a + posY * 2 + posX * lda; | |||
|         done = 0; | |||
|         do { | |||
|             if (pos == posY) { | |||
|                 /* Scalar tile: no vector-length-agnostic form is used here. */ | |||
|                 for (int c = 0; c < width; c++) { | |||
|                     FLOAT *dst = b + 2 * c * width; | |||
|                     FLOAT *line = src + c * lda; | |||
|                     for (int r = 0; r < c; r++) { | |||
|                         dst[2 * r] = line[2 * r]; | |||
|                         dst[2 * r + 1] = line[2 * r + 1]; | |||
|                     } | |||
| #ifdef UNIT | |||
|                     dst[2 * c] = ONE; | |||
|                     dst[2 * c + 1] = ZERO; | |||
| #else | |||
|                     dst[2 * c] = line[2 * c]; | |||
|                     dst[2 * c + 1] = line[2 * c + 1]; | |||
| #endif | |||
|                     for (int r = c + 1; r < width; r++) { | |||
|                         dst[2 * r] = ZERO; | |||
|                         dst[2 * r + 1] = ZERO; | |||
|                     } | |||
|                 } | |||
|                 src += width * lda; | |||
|                 b += width * width * 2; | |||
|                 pos += width; | |||
|                 done += width; | |||
|             } else { | |||
|                 if (pos > posY) { | |||
|                     /* two-way structured load and store of `width` pairs */ | |||
|                     svst2(pg, b, svld2(pg, src)); | |||
|                     src += lda; | |||
|                 } else { | |||
|                     src += 2; | |||
|                 } | |||
|                 b += width * 2; | |||
|                 pos++; | |||
|                 done++; | |||
|             } | |||
|         } while (done < m); | |||
|         posY += width; | |||
|         col += width; | |||
|         /* predicate for the next strip; stop once no lane is enabled */ | |||
| #ifdef DOUBLE | |||
|         pg = svwhilelt_b64(col, n); | |||
|         width = svcntp_b64(svptrue_b64(), pg); | |||
|         more = svptest_any(svptrue_b64(), pg); | |||
| #else | |||
|         pg = svwhilelt_b32(col, n); | |||
|         width = svcntp_b32(svptrue_b32(), pg); | |||
|         more = svptest_any(svptrue_b32(), pg); | |||
| #endif | |||
|     } while (more); | |||
|     return 0; | |||
| } | |||
| @@ -0,0 +1,119 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| #include "arm_sve.h" | |||
| /* | |||
|  * Copy strips of the interleaved (real, imaginary) matrix `a` into `b`. | |||
|  * A strip is `width` columns wide (the number of lanes enabled by the | |||
|  * while-less-than predicate); `diag` starts at `offset` and moves by | |||
|  * `width` per strip. For every row of a strip: | |||
|  *   row == diag : a width x width tile is filled for r < c from | |||
|  *                 src[r * lda + 2 * c]; the diagonal pair is produced by | |||
|  *                 compinv (defined outside this file; presumably the | |||
|  *                 reciprocal of the complex diagonal entry - confirm). | |||
|  *                 Tile entries with r > c are not written. | |||
|  *   row >  diag : `width` elements spaced lda apart are gathered and | |||
|  *                 stored as interleaved pairs; | |||
|  *   row <  diag : nothing is stored, only the pointers advance. | |||
|  * Always returns 0. | |||
|  */ | |||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ | |||
|     BLASLONG row, diag; | |||
|     FLOAT *src; | |||
|     int width, more; | |||
|     /* every element takes two FLOATs: real at src, imaginary at src + 1 */ | |||
|     lda *= 2; | |||
|     diag = offset; | |||
| #ifdef DOUBLE | |||
|     int64_t col = 0; | |||
|     svint64_t stride_idx = svindex_s64(0LL, lda); | |||
|     svbool_t pg = svwhilelt_b64(col, n); | |||
|     width = svcntp_b64(svptrue_b64(), pg); | |||
| #else | |||
|     int32_t n32 = n; | |||
|     int32_t col = 0; | |||
|     svint32_t stride_idx = svindex_s32(0, lda); | |||
|     svbool_t pg = svwhilelt_b32(col, n32); | |||
|     width = svcntp_b32(svptrue_b32(), pg); | |||
| #endif | |||
|     do { | |||
|         src = a; | |||
|         row = 0; | |||
|         do { | |||
|             if (row != diag) { | |||
|                 if (row > diag) { | |||
|                     /* gather real and imaginary parts, store them interleaved */ | |||
|                     svst2(pg, b, svcreate2(svld1_gather_index(pg, src, stride_idx), | |||
|                                            svld1_gather_index(pg, src + 1, stride_idx))); | |||
|                 } | |||
|                 src += 2; | |||
|                 b += width * 2; | |||
|                 row++; | |||
|             } else { | |||
|                 for (int c = 0; c < width; c++) { | |||
|                     FLOAT *dst = b + 2 * c * width; | |||
|                     for (int r = 0; r < c; r++) { | |||
|                         dst[2 * r] = src[r * lda + 2 * c]; | |||
|                         dst[2 * r + 1] = src[r * lda + 2 * c + 1]; | |||
|                     } | |||
|                     compinv(dst + 2 * c, src[c * lda + 2 * c], src[c * lda + 2 * c + 1]); | |||
|                 } | |||
|                 src += width * 2; | |||
|                 b += width * width * 2; | |||
|                 row += width; | |||
|             } | |||
|         } while (row < m); | |||
|         a += width * lda; | |||
|         diag += width; | |||
|         col += width; | |||
|         /* predicate for the next strip; stop once no lane is enabled */ | |||
| #ifdef DOUBLE | |||
|         pg = svwhilelt_b64(col, n); | |||
|         width = svcntp_b64(svptrue_b64(), pg); | |||
|         more = svptest_any(svptrue_b64(), pg); | |||
| #else | |||
|         pg = svwhilelt_b32(col, n32); | |||
|         width = svcntp_b32(svptrue_b32(), pg); | |||
|         more = svptest_any(svptrue_b32(), pg); | |||
| #endif | |||
|     } while (more); | |||
|     return 0; | |||
| } | |||
| @@ -0,0 +1,115 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| #include "arm_sve.h" | |||
| /* | |||
|  * Copy strips of the interleaved (real, imaginary) matrix `a` into `b`. | |||
|  * A strip is `width` columns wide (the number of lanes enabled by the | |||
|  * while-less-than predicate); `diag` starts at `offset` and moves by | |||
|  * `width` per strip. For every row of a strip: | |||
|  *   row == diag : a width x width tile is filled; the diagonal pair is | |||
|  *                 produced by compinv (defined outside this file; | |||
|  *                 presumably the reciprocal of the complex diagonal | |||
|  *                 entry - confirm) and entries with r > c are copied | |||
|  *                 from src[c * lda + 2 * r]. Entries with r < c are not | |||
|  *                 written. | |||
|  *   row <  diag : `width` contiguous pairs are loaded and stored; | |||
|  *   row >  diag : nothing is stored, only the pointers advance. | |||
|  * Always returns 0. | |||
|  */ | |||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ | |||
|     BLASLONG row, diag; | |||
|     FLOAT *src; | |||
|     int width, more; | |||
|     /* every element takes two FLOATs, so the stride is doubled */ | |||
|     lda *= 2; | |||
|     diag = offset; | |||
| #ifdef DOUBLE | |||
|     int64_t col = 0; | |||
|     svbool_t pg = svwhilelt_b64(col, n); | |||
|     width = svcntp_b64(svptrue_b64(), pg); | |||
| #else | |||
|     int32_t n32 = n; | |||
|     int32_t col = 0; | |||
|     svbool_t pg = svwhilelt_b32(col, n32); | |||
|     width = svcntp_b32(svptrue_b32(), pg); | |||
| #endif | |||
|     do { | |||
|         src = a; | |||
|         row = 0; | |||
|         do { | |||
|             if (row != diag) { | |||
|                 if (row < diag) { | |||
|                     /* two-way structured load and store of `width` pairs */ | |||
|                     svst2(pg, b, svld2(pg, src)); | |||
|                 } | |||
|                 src += lda; | |||
|                 b += width * 2; | |||
|                 row++; | |||
|             } else { | |||
|                 for (int c = 0; c < width; c++) { | |||
|                     FLOAT *dst = b + 2 * c * width; | |||
|                     compinv(dst + 2 * c, src[c * lda + 2 * c], src[c * lda + 2 * c + 1]); | |||
|                     for (int r = c + 1; r < width; r++) { | |||
|                         dst[2 * r] = src[c * lda + 2 * r]; | |||
|                         dst[2 * r + 1] = src[c * lda + 2 * r + 1]; | |||
|                     } | |||
|                 } | |||
|                 b += width * width * 2; | |||
|                 src += lda * width; | |||
|                 row += width; | |||
|             } | |||
|         } while (row < m); | |||
|         a += width * 2; | |||
|         diag += width; | |||
|         col += width; | |||
|         /* predicate for the next strip; stop once no lane is enabled */ | |||
| #ifdef DOUBLE | |||
|         pg = svwhilelt_b64(col, n); | |||
|         width = svcntp_b64(svptrue_b64(), pg); | |||
|         more = svptest_any(svptrue_b64(), pg); | |||
| #else | |||
|         pg = svwhilelt_b32(col, n32); | |||
|         width = svcntp_b32(svptrue_b32(), pg); | |||
|         more = svptest_any(svptrue_b32(), pg); | |||
| #endif | |||
|     } while (more); | |||
|     return 0; | |||
| } | |||
| @@ -0,0 +1,119 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| #include "arm_sve.h" | |||
| /* | |||
|  * Copy strips of the interleaved (real, imaginary) matrix `a` into `b`. | |||
|  * A strip is `width` columns wide (the number of lanes enabled by the | |||
|  * while-less-than predicate); `diag` starts at `offset` and moves by | |||
|  * `width` per strip. For every row of a strip: | |||
|  *   row == diag : a width x width tile is filled; the diagonal pair is | |||
|  *                 produced by compinv (defined outside this file; | |||
|  *                 presumably the reciprocal of the complex diagonal | |||
|  *                 entry - confirm) and entries with r > c are copied | |||
|  *                 from src[r * lda + 2 * c]. Entries with r < c are not | |||
|  *                 written. | |||
|  *   row <  diag : `width` elements spaced lda apart are gathered and | |||
|  *                 stored as interleaved pairs; | |||
|  *   row >  diag : nothing is stored, only the pointers advance. | |||
|  * Always returns 0. | |||
|  */ | |||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ | |||
|     BLASLONG row, diag; | |||
|     FLOAT *src; | |||
|     int width, more; | |||
|     /* every element takes two FLOATs: real at src, imaginary at src + 1 */ | |||
|     lda *= 2; | |||
|     diag = offset; | |||
| #ifdef DOUBLE | |||
|     int64_t col = 0; | |||
|     svint64_t stride_idx = svindex_s64(0LL, lda); | |||
|     svbool_t pg = svwhilelt_b64(col, n); | |||
|     width = svcntp_b64(svptrue_b64(), pg); | |||
| #else | |||
|     int32_t n32 = n; | |||
|     int32_t col = 0; | |||
|     svint32_t stride_idx = svindex_s32(0, lda); | |||
|     svbool_t pg = svwhilelt_b32(col, n32); | |||
|     width = svcntp_b32(svptrue_b32(), pg); | |||
| #endif | |||
|     do { | |||
|         src = a; | |||
|         row = 0; | |||
|         do { | |||
|             if (row != diag) { | |||
|                 if (row < diag) { | |||
|                     /* gather real and imaginary parts, store them interleaved */ | |||
|                     svst2(pg, b, svcreate2(svld1_gather_index(pg, src, stride_idx), | |||
|                                            svld1_gather_index(pg, src + 1, stride_idx))); | |||
|                 } | |||
|                 src += 2; | |||
|                 b += width * 2; | |||
|                 row++; | |||
|             } else { | |||
|                 for (int c = 0; c < width; c++) { | |||
|                     FLOAT *dst = b + 2 * c * width; | |||
|                     compinv(dst + 2 * c, src[c * lda + 2 * c], src[c * lda + 2 * c + 1]); | |||
|                     for (int r = c + 1; r < width; r++) { | |||
|                         dst[2 * r] = src[r * lda + 2 * c]; | |||
|                         dst[2 * r + 1] = src[r * lda + 2 * c + 1]; | |||
|                     } | |||
|                 } | |||
|                 src += width * 2; | |||
|                 b += width * width * 2; | |||
|                 row += width; | |||
|             } | |||
|         } while (row < m); | |||
|         a += width * lda; | |||
|         diag += width; | |||
|         col += width; | |||
|         /* predicate for the next strip; stop once no lane is enabled */ | |||
| #ifdef DOUBLE | |||
|         pg = svwhilelt_b64(col, n); | |||
|         width = svcntp_b64(svptrue_b64(), pg); | |||
|         more = svptest_any(svptrue_b64(), pg); | |||
| #else | |||
|         pg = svwhilelt_b32(col, n32); | |||
|         width = svcntp_b32(svptrue_b32(), pg); | |||
|         more = svptest_any(svptrue_b32(), pg); | |||
| #endif | |||
|     } while (more); | |||
|     return 0; | |||
| } | |||
| @@ -0,0 +1,115 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| #include "arm_sve.h" | |||
| /* | |||
|  * Copy strips of the interleaved (real, imaginary) matrix `a` into `b`. | |||
|  * A strip is `width` columns wide (the number of lanes enabled by the | |||
|  * while-less-than predicate); `diag` starts at `offset` and moves by | |||
|  * `width` per strip. For every row of a strip: | |||
|  *   row == diag : a width x width tile is filled for r < c from | |||
|  *                 src[c * lda + 2 * r]; the diagonal pair is produced by | |||
|  *                 compinv (defined outside this file; presumably the | |||
|  *                 reciprocal of the complex diagonal entry - confirm). | |||
|  *                 Tile entries with r > c are not written. | |||
|  *   row >  diag : `width` contiguous pairs are loaded and stored; | |||
|  *   row <  diag : nothing is stored, only the pointers advance. | |||
|  * Always returns 0. | |||
|  */ | |||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){ | |||
|     BLASLONG row, diag; | |||
|     FLOAT *src; | |||
|     int width, more; | |||
|     /* every element takes two FLOATs, so the stride is doubled */ | |||
|     lda *= 2; | |||
|     diag = offset; | |||
| #ifdef DOUBLE | |||
|     int64_t col = 0; | |||
|     svbool_t pg = svwhilelt_b64(col, n); | |||
|     width = svcntp_b64(svptrue_b64(), pg); | |||
| #else | |||
|     int32_t n32 = n; | |||
|     int32_t col = 0; | |||
|     svbool_t pg = svwhilelt_b32(col, n32); | |||
|     width = svcntp_b32(svptrue_b32(), pg); | |||
| #endif | |||
|     do { | |||
|         src = a; | |||
|         row = 0; | |||
|         do { | |||
|             if (row != diag) { | |||
|                 if (row > diag) { | |||
|                     /* two-way structured load and store of `width` pairs */ | |||
|                     svst2(pg, b, svld2(pg, src)); | |||
|                 } | |||
|                 src += lda; | |||
|                 b += width * 2; | |||
|                 row++; | |||
|             } else { | |||
|                 for (int c = 0; c < width; c++) { | |||
|                     FLOAT *dst = b + 2 * c * width; | |||
|                     for (int r = 0; r < c; r++) { | |||
|                         dst[2 * r] = src[c * lda + 2 * r]; | |||
|                         dst[2 * r + 1] = src[c * lda + 2 * r + 1]; | |||
|                     } | |||
|                     compinv(dst + 2 * c, src[c * lda + 2 * c], src[c * lda + 2 * c + 1]); | |||
|                 } | |||
|                 src += lda * width; | |||
|                 b += width * width * 2; | |||
|                 row += width; | |||
|             } | |||
|         } while (row < m); | |||
|         a += width * 2; | |||
|         diag += width; | |||
|         col += width; | |||
|         /* predicate for the next strip; stop once no lane is enabled */ | |||
| #ifdef DOUBLE | |||
|         pg = svwhilelt_b64(col, n); | |||
|         width = svcntp_b64(svptrue_b64(), pg); | |||
|         more = svptest_any(svptrue_b64(), pg); | |||
| #else | |||
|         pg = svwhilelt_b32(col, n32); | |||
|         width = svcntp_b32(svptrue_b32(), pg); | |||
|         more = svptest_any(svptrue_b32(), pg); | |||
| #endif | |||
|     } while (more); | |||
|     return 0; | |||
| } | |||
| @@ -0,0 +1,149 @@ | |||
| # Kernel source selection. Every entry below points at a C source under | |||
| # ../arm or ../generic; the *COPYOBJ entries name the copy-routine objects. | |||
| # --- max / min and their I-prefixed variants (../arm) --- | |||
| SAMAXKERNEL    = ../arm/amax.c | |||
| DAMAXKERNEL    = ../arm/amax.c | |||
| CAMAXKERNEL    = ../arm/zamax.c | |||
| ZAMAXKERNEL    = ../arm/zamax.c | |||
| SAMINKERNEL    = ../arm/amin.c | |||
| DAMINKERNEL    = ../arm/amin.c | |||
| CAMINKERNEL    = ../arm/zamin.c | |||
| ZAMINKERNEL    = ../arm/zamin.c | |||
| SMAXKERNEL     = ../arm/max.c | |||
| DMAXKERNEL     = ../arm/max.c | |||
| SMINKERNEL     = ../arm/min.c | |||
| DMINKERNEL     = ../arm/min.c | |||
| ISAMAXKERNEL   = ../arm/iamax.c | |||
| IDAMAXKERNEL   = ../arm/iamax.c | |||
| ICAMAXKERNEL   = ../arm/izamax.c | |||
| IZAMAXKERNEL   = ../arm/izamax.c | |||
| ISAMINKERNEL   = ../arm/iamin.c | |||
| IDAMINKERNEL   = ../arm/iamin.c | |||
| ICAMINKERNEL   = ../arm/izamin.c | |||
| IZAMINKERNEL   = ../arm/izamin.c | |||
| ISMAXKERNEL    = ../arm/imax.c | |||
| IDMAXKERNEL    = ../arm/imax.c | |||
| ISMINKERNEL    = ../arm/imin.c | |||
| IDMINKERNEL    = ../arm/imin.c | |||
| # --- asum / sum / axpy / copy / dot / nrm2 / rot / scal / swap --- | |||
| SASUMKERNEL    = ../arm/asum.c | |||
| DASUMKERNEL    = ../arm/asum.c | |||
| CASUMKERNEL    = ../arm/zasum.c | |||
| ZASUMKERNEL    = ../arm/zasum.c | |||
| SSUMKERNEL     = ../arm/sum.c | |||
| DSUMKERNEL     = ../arm/sum.c | |||
| CSUMKERNEL     = ../arm/zsum.c | |||
| ZSUMKERNEL     = ../arm/zsum.c | |||
| SAXPYKERNEL    = ../arm/axpy.c | |||
| DAXPYKERNEL    = ../arm/axpy.c | |||
| CAXPYKERNEL    = ../arm/zaxpy.c | |||
| ZAXPYKERNEL    = ../arm/zaxpy.c | |||
| SCOPYKERNEL    = ../arm/copy.c | |||
| DCOPYKERNEL    = ../arm/copy.c | |||
| CCOPYKERNEL    = ../arm/zcopy.c | |||
| ZCOPYKERNEL    = ../arm/zcopy.c | |||
| SDOTKERNEL     = ../arm/dot.c | |||
| DDOTKERNEL     = ../arm/dot.c | |||
| CDOTKERNEL     = ../arm/zdot.c | |||
| ZDOTKERNEL     = ../arm/zdot.c | |||
| DSDOTKERNEL    = ../generic/dot.c | |||
| SNRM2KERNEL    = ../arm/nrm2.c | |||
| DNRM2KERNEL    = ../arm/nrm2.c | |||
| CNRM2KERNEL    = ../arm/znrm2.c | |||
| ZNRM2KERNEL    = ../arm/znrm2.c | |||
| SROTKERNEL     = ../arm/rot.c | |||
| DROTKERNEL     = ../arm/rot.c | |||
| CROTKERNEL     = ../arm/zrot.c | |||
| ZROTKERNEL     = ../arm/zrot.c | |||
| SSCALKERNEL    = ../arm/scal.c | |||
| DSCALKERNEL    = ../arm/scal.c | |||
| CSCALKERNEL    = ../arm/zscal.c | |||
| ZSCALKERNEL    = ../arm/zscal.c | |||
| SSWAPKERNEL    = ../arm/swap.c | |||
| DSWAPKERNEL    = ../arm/swap.c | |||
| CSWAPKERNEL    = ../arm/zswap.c | |||
| ZSWAPKERNEL    = ../arm/zswap.c | |||
| # --- gemv (../arm) --- | |||
| SGEMVNKERNEL   = ../arm/gemv_n.c | |||
| DGEMVNKERNEL   = ../arm/gemv_n.c | |||
| CGEMVNKERNEL   = ../arm/zgemv_n.c | |||
| ZGEMVNKERNEL   = ../arm/zgemv_n.c | |||
| SGEMVTKERNEL   = ../arm/gemv_t.c | |||
| DGEMVTKERNEL   = ../arm/gemv_t.c | |||
| CGEMVTKERNEL   = ../arm/zgemv_t.c | |||
| ZGEMVTKERNEL   = ../arm/zgemv_t.c | |||
| # --- trmm / gemm: generic 2x2 kernels and their copy routines --- | |||
| STRMMKERNEL    = ../generic/trmmkernel_2x2.c | |||
| DTRMMKERNEL    = ../generic/trmmkernel_2x2.c | |||
| CTRMMKERNEL    = ../generic/ztrmmkernel_2x2.c | |||
| ZTRMMKERNEL    = ../generic/ztrmmkernel_2x2.c | |||
| SGEMMKERNEL    = ../generic/gemmkernel_2x2.c | |||
| SGEMMONCOPY    = ../generic/gemm_ncopy_2.c | |||
| SGEMMOTCOPY    = ../generic/gemm_tcopy_2.c | |||
| SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMKERNEL    = ../generic/gemmkernel_2x2.c | |||
| DGEMMONCOPY    = ../generic/gemm_ncopy_2.c | |||
| DGEMMOTCOPY    = ../generic/gemm_tcopy_2.c | |||
| DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMKERNEL    = ../generic/zgemmkernel_2x2.c | |||
| CGEMMONCOPY    = ../generic/zgemm_ncopy_2.c | |||
| CGEMMOTCOPY    = ../generic/zgemm_tcopy_2.c | |||
| CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMKERNEL    = ../generic/zgemmkernel_2x2.c | |||
| ZGEMMONCOPY    = ../generic/zgemm_ncopy_2.c | |||
| ZGEMMOTCOPY    = ../generic/zgemm_tcopy_2.c | |||
| ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| # --- trsm: the same generic sources for all four precisions --- | |||
| STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| # --- cabs / lsame / gemm beta (../generic) --- | |||
| SCABS_KERNEL   = ../generic/cabs.c | |||
| DCABS_KERNEL   = ../generic/cabs.c | |||
| QCABS_KERNEL   = ../generic/cabs.c | |||
| LSAME_KERNEL   = ../generic/lsame.c | |||
| SGEMM_BETA     = ../generic/gemm_beta.c | |||
| DGEMM_BETA     = ../generic/gemm_beta.c | |||
| CGEMM_BETA     = ../generic/zgemm_beta.c | |||
| ZGEMM_BETA     = ../generic/zgemm_beta.c | |||
| @@ -0,0 +1 @@ | |||
| # Double-colon `clean` rule with no prerequisites and no recipe. | |||
| clean:: | |||
| @@ -1 +1,14 @@ | |||
| #TODO: Add loongarch64 SIMD optimizations | |||
| # DGEMM: 16x4 assembly kernel with its 16-wide and 4-wide copy routines. | |||
| DGEMMKERNEL = dgemm_kernel_16x4.S | |||
| DGEMMINCOPY = dgemm_ncopy_16.S | |||
| DGEMMITCOPY = dgemm_tcopy_16.S | |||
| DGEMMONCOPY = dgemm_ncopy_4.S | |||
| DGEMMOTCOPY = dgemm_tcopy_4.S | |||
| # Copy-routine object names use $(TSUFFIX).$(SUFFIX) instead of a fixed | |||
| # ".o", matching the *COPYOBJ entries of the other KERNEL lists, so the | |||
| # names follow whatever target and object suffixes the build sets. | |||
| DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| # DTRSM: generic C kernels. | |||
| DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| @@ -0,0 +1,691 @@ | |||
| /******************************************************************************* | |||
| Copyright (c) 2021, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *******************************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| /* Function parameters */ | |||
| #define M $r4 // param 1: m | |||
| #define N $r5 // param 2: n | |||
| #define SRC $r6 // param 3: src | |||
| #define LDA $r7 // param 4: lda | |||
| #define DST $r8 // param 5: dst | |||
| #define I $r9 | |||
| #define J $r10 | |||
| #define S1 $r12 | |||
| #define S2 $r13 | |||
| #define S3 $r14 | |||
| #define S4 $r15 | |||
| #define S5 $r16 | |||
| #define S6 $r17 | |||
| #define S7 $r18 | |||
| #define S8 $r19 | |||
| #define S9 $r20 | |||
| #define S10 $r23 | |||
| #define S11 $r24 | |||
| #define S12 $r25 | |||
| #define S13 $r26 | |||
| #define S14 $r27 | |||
| #define S15 $r28 | |||
| #define S16 $r29 | |||
| #define TD $r30 | |||
| #define TS $r31 | |||
| #define TL $r7 | |||
| #define T0 $r6 | |||
| #define ZERO $r0 | |||
| #define F0 $f0 | |||
| #define F1 $f1 | |||
| #define F2 $f2 | |||
| #define F3 $f3 | |||
| #define F4 $f4 | |||
| #define F5 $f5 | |||
| #define F6 $f6 | |||
| #define F7 $f7 | |||
| /* LASX vectors */ | |||
| #define U0 $xr0 | |||
| #define U1 $xr1 | |||
| #define U2 $xr2 | |||
| #define U3 $xr3 | |||
| #define U4 $xr4 | |||
| #define U5 $xr5 | |||
| #define U6 $xr6 | |||
| #define U7 $xr7 | |||
| #define U8 $xr8 | |||
| #define U9 $xr9 | |||
| #define U10 $xr10 | |||
| #define U11 $xr11 | |||
| #define U12 $xr12 | |||
| #define U13 $xr13 | |||
| #define U14 $xr14 | |||
| #define U15 $xr15 | |||
| #define D0 $xr16 | |||
| #define D1 $xr17 | |||
| #define D2 $xr18 | |||
| #define D3 $xr19 | |||
| #define D4 $xr20 | |||
| #define D5 $xr21 | |||
| #define D6 $xr22 | |||
| #define D7 $xr23 | |||
| #define D8 $xr24 | |||
| #define D9 $xr25 | |||
| #define D10 $xr26 | |||
| #define D11 $xr27 | |||
| #define D12 $xr28 | |||
| #define D13 $xr29 | |||
| #define D14 $xr30 | |||
| #define D15 $xr31 | |||
| PROLOGUE | |||
| addi.d $sp, $sp, -0x90 // reserve 0x90 bytes of stack for the saves below | |||
| SDARG $r23, $sp, 0x00 // $r23-$r31 are used below as S10-S16, TD and TS | |||
| SDARG $r24, $sp, 0x08 | |||
| SDARG $r25, $sp, 0x10 | |||
| SDARG $r26, $sp, 0x18 | |||
| SDARG $r27, $sp, 0x20 | |||
| SDARG $r28, $sp, 0x28 | |||
| SDARG $r29, $sp, 0x30 | |||
| SDARG $r30, $sp, 0x38 | |||
| SDARG $r31, $sp, 0x40 | |||
| ST $f23, $sp, 0x48 // NOTE(review): $f23-$f31 presumably alias the low part of $xr23-$xr31 (D7-D15) - confirm | |||
| ST $f24, $sp, 0x50 | |||
| ST $f25, $sp, 0x58 | |||
| ST $f26, $sp, 0x60 | |||
| ST $f27, $sp, 0x68 | |||
| ST $f28, $sp, 0x70 | |||
| ST $f29, $sp, 0x78 | |||
| ST $f30, $sp, 0x80 | |||
| ST $f31, $sp, 0x88 | |||
| move TD, DST // TD: running destination pointer | |||
| move TS, SRC // TS: running source pointer; must be taken before T0 (same register as SRC) is written | |||
| slli.d TL, LDA, 0x03 // TL = LDA << 3 (TL shares the register of LDA); elements are moved 8 bytes at a time | |||
| slli.d T0, TL, 0x01 // T0 = 2 * TL | |||
| srai.d J, N, 0x04 // J = N >> 4: number of 16-column groups | |||
| beq J, ZERO, .L_N8 // no full group: go straight to the tails | |||
| .L_J1: /* J-- : set up S1..S16 = TS + k*TL (k = 0..15) for one 16-column group */ | |||
| move S1, TS | |||
| add.d S2, TS, TL | |||
| srai.d I, M, 0x03 // I = M >> 3: trip count of the vector loop .L_I1 | |||
| add.d S3, S2, TL | |||
| addi.d J, J, -1 | |||
| add.d S4, S3, TL | |||
| add.d S5, S3, T0 // from here each pointer is the one two places back plus T0 (= 2*TL) | |||
| add.d S6, S4, T0 | |||
| add.d S7, S5, T0 | |||
| add.d S8, S6, T0 | |||
| add.d S9, S7, T0 | |||
| add.d S10, S8, T0 | |||
| add.d S11, S9, T0 | |||
| add.d S12, S10, T0 | |||
| add.d S13, S11, T0 | |||
| add.d S14, S12, T0 | |||
| add.d S15, S13, T0 | |||
| add.d S16, S14, T0 | |||
| add.d TS, S15, T0 // TS = S1 + 16*TL: start of the next group | |||
| beq I, ZERO, .L_I7 // M < 8: only the element-wise remainder loop runs | |||
| .L_I1: /* I-- : per iteration, two passes (source offsets 0x00 and 0x20), each loading 32 bytes from S1..S16 and storing 0x200 bytes to TD */ | |||
| xvld U0, S1, 0x00 // first pass: 32 bytes from each of the 16 source pointers | |||
| xvld U1, S2, 0x00 | |||
| xvld U2, S3, 0x00 | |||
| xvld U3, S4, 0x00 | |||
| xvld U4, S5, 0x00 | |||
| xvld U5, S6, 0x00 | |||
| xvld U6, S7, 0x00 | |||
| xvld U7, S8, 0x00 | |||
| xvld U8, S9, 0x00 | |||
| xvld U9, S10, 0x00 | |||
| xvld U10, S11, 0x00 | |||
| xvld U11, S12, 0x00 | |||
| xvld U12, S13, 0x00 | |||
| xvld U13, S14, 0x00 | |||
| xvld U14, S15, 0x00 | |||
| xvld U15, S16, 0x00 | |||
| xvpackev.d D0, U1, U0 // NOTE(review): presumably interleaves the even 64-bit lanes of a source pair (packod: the odd lanes) - confirm against the LASX manual | |||
| xvpackod.d D1, U1, U0 | |||
| xvpackev.d D2, U3, U2 | |||
| xvpackod.d D3, U3, U2 | |||
| xvpackev.d D4, U5, U4 | |||
| xvpackod.d D5, U5, U4 | |||
| xvpackev.d D6, U7, U6 | |||
| xvpackod.d D7, U7, U6 | |||
| xvpackev.d D8, U9, U8 | |||
| xvpackod.d D9, U9, U8 | |||
| xvpackev.d D10, U11, U10 | |||
| xvpackod.d D11, U11, U10 | |||
| xvpackev.d D12, U13, U12 | |||
| xvpackod.d D13, U13, U12 | |||
| xvpackev.d D14, U15, U14 | |||
| xvpackod.d D15, U15, U14 | |||
| xvand.v U0, D0, D0 // x AND x: keeps a copy of D0 in U0 before D0 is overwritten on the next line | |||
| xvpermi.q D0, D2, 0x02 // 0 (the numbers in these comments match the 0x20-byte store slots used below) | |||
| xvand.v U4, D4, D4 | |||
| xvpermi.q D4, D6, 0x02 // 1 | |||
| xvand.v U1, D1, D1 | |||
| xvpermi.q D1, D3, 0x02 // 4 | |||
| xvand.v U5, D5, D5 | |||
| xvpermi.q D5, D7, 0x02 // 5 | |||
| xvpermi.q D2, U0, 0x31 // 8 | |||
| xvpermi.q D6, U4, 0x31 // 9 | |||
| xvpermi.q D3, U1, 0x31 // 12 | |||
| xvpermi.q D7, U5, 0x31 // 13 | |||
| xvand.v U8, D8, D8 | |||
| xvpermi.q D8, D10, 0x02 // 2 | |||
| xvand.v U12, D12, D12 | |||
| xvpermi.q D12, D14, 0x02 // 3 | |||
| xvand.v U9, D9, D9 | |||
| xvpermi.q D9, D11, 0x02 // 6 | |||
| xvand.v U13, D13, D13 | |||
| xvpermi.q D13, D15, 0x02 // 7 | |||
| xvpermi.q D10, U8, 0x31 // 10 | |||
| xvpermi.q D14, U12, 0x31 // 11 | |||
| xvpermi.q D11, U9, 0x31 // 14 | |||
| xvpermi.q D15, U13, 0x31 // 15 | |||
| xvst D0, TD, 0x00 // 0 | |||
| xvst D4, TD, 0x20 // 1 | |||
| xvst D8, TD, 0x40 // 2 | |||
| xvst D12, TD, 0x60 // 3 | |||
| xvst D1, TD, 0x80 // 4 | |||
| xvst D5, TD, 0xA0 // 5 | |||
| xvst D9, TD, 0xC0 // 6 | |||
| xvst D13, TD, 0xE0 // 7 | |||
| addi.d TD, TD, 0x100 | |||
| xvst D2, TD, 0x00 // 8 | |||
| xvst D6, TD, 0x20 // 9 | |||
| xvst D10, TD, 0x40 // 10 | |||
| xvst D14, TD, 0x60 // 11 | |||
| xvst D3, TD, 0x80 // 12 | |||
| xvst D7, TD, 0xA0 // 13 | |||
| xvst D11, TD, 0xC0 // 14 | |||
| xvst D15, TD, 0xE0 // 15 | |||
| addi.d TD, TD, 0x100 | |||
| xvld U0, S1, 0x20 // second pass: same sequence on the next 32 bytes of each source pointer | |||
| xvld U1, S2, 0x20 | |||
| xvld U2, S3, 0x20 | |||
| xvld U3, S4, 0x20 | |||
| xvld U4, S5, 0x20 | |||
| xvld U5, S6, 0x20 | |||
| xvld U6, S7, 0x20 | |||
| xvld U7, S8, 0x20 | |||
| xvld U8, S9, 0x20 | |||
| xvld U9, S10, 0x20 | |||
| xvld U10, S11, 0x20 | |||
| xvld U11, S12, 0x20 | |||
| xvld U12, S13, 0x20 | |||
| xvld U13, S14, 0x20 | |||
| xvld U14, S15, 0x20 | |||
| xvld U15, S16, 0x20 | |||
| xvpackev.d D0, U1, U0 | |||
| xvpackod.d D1, U1, U0 | |||
| xvpackev.d D2, U3, U2 | |||
| xvpackod.d D3, U3, U2 | |||
| xvpackev.d D4, U5, U4 | |||
| xvpackod.d D5, U5, U4 | |||
| xvpackev.d D6, U7, U6 | |||
| xvpackod.d D7, U7, U6 | |||
| xvpackev.d D8, U9, U8 | |||
| xvpackod.d D9, U9, U8 | |||
| xvpackev.d D10, U11, U10 | |||
| xvpackod.d D11, U11, U10 | |||
| xvpackev.d D12, U13, U12 | |||
| xvpackod.d D13, U13, U12 | |||
| xvpackev.d D14, U15, U14 | |||
| xvpackod.d D15, U15, U14 | |||
| xvand.v U0, D0, D0 | |||
| xvpermi.q D0, D2, 0x02 // 0 | |||
| xvand.v U4, D4, D4 | |||
| xvpermi.q D4, D6, 0x02 // 1 | |||
| xvand.v U1, D1, D1 | |||
| xvpermi.q D1, D3, 0x02 // 4 | |||
| xvand.v U5, D5, D5 | |||
| xvpermi.q D5, D7, 0x02 // 5 | |||
| xvpermi.q D2, U0, 0x31 // 8 | |||
| xvpermi.q D6, U4, 0x31 // 9 | |||
| xvpermi.q D3, U1, 0x31 // 12 | |||
| xvpermi.q D7, U5, 0x31 // 13 | |||
| xvand.v U8, D8, D8 | |||
| xvpermi.q D8, D10, 0x02 // 2 | |||
| xvand.v U12, D12, D12 | |||
| xvpermi.q D12, D14, 0x02 // 3 | |||
| xvand.v U9, D9, D9 | |||
| xvpermi.q D9, D11, 0x02 // 6 | |||
| xvand.v U13, D13, D13 | |||
| xvpermi.q D13, D15, 0x02 // 7 | |||
| xvpermi.q D10, U8, 0x31 // 10 | |||
| xvpermi.q D14, U12, 0x31 // 11 | |||
| xvpermi.q D11, U9, 0x31 // 14 | |||
| xvpermi.q D15, U13, 0x31 // 15 | |||
| xvst D0, TD, 0x00 // 0 | |||
| xvst D4, TD, 0x20 // 1 | |||
| xvst D8, TD, 0x40 // 2 | |||
| xvst D12, TD, 0x60 // 3 | |||
| xvst D1, TD, 0x80 // 4 | |||
| xvst D5, TD, 0xA0 // 5 | |||
| xvst D9, TD, 0xC0 // 6 | |||
| xvst D13, TD, 0xE0 // 7 | |||
| addi.d TD, TD, 0x100 | |||
| xvst D2, TD, 0x00 // 8 | |||
| xvst D6, TD, 0x20 // 9 | |||
| xvst D10, TD, 0x40 // 10 | |||
| xvst D14, TD, 0x60 // 11 | |||
| xvst D3, TD, 0x80 // 12 | |||
| xvst D7, TD, 0xA0 // 13 | |||
| xvst D11, TD, 0xC0 // 14 | |||
| xvst D15, TD, 0xE0 // 15 | |||
| addi.d TD, TD, 0x100 | |||
| addi.d S1, S1, 0x40 // both passes done: advance every source pointer by 0x40 bytes | |||
| addi.d S2, S2, 0x40 | |||
| addi.d S3, S3, 0x40 | |||
| addi.d S4, S4, 0x40 | |||
| addi.d S5, S5, 0x40 | |||
| addi.d S6, S6, 0x40 | |||
| addi.d S7, S7, 0x40 | |||
| addi.d S8, S8, 0x40 | |||
| addi.d S9, S9, 0x40 | |||
| addi.d S10, S10, 0x40 | |||
| addi.d S11, S11, 0x40 | |||
| addi.d S12, S12, 0x40 | |||
| addi.d S13, S13, 0x40 | |||
| addi.d S14, S14, 0x40 | |||
| addi.d S15, S15, 0x40 | |||
| addi.d S16, S16, 0x40 | |||
| addi.d I, I, -1 | |||
| blt ZERO, I, .L_I1 // repeat while I > 0 | |||
| .L_I7: /* remainder of the 16-column group: the low three bits of M */ | |||
| andi I, M, 0x07 // I = M & 7 | |||
| beq I, ZERO, .L_I0 | |||
| .L_II1: /* I-- : one 8-byte element from each of S1..S16 per iteration, 0x80 bytes written to TD */ | |||
| fld.d F0, S1, 0x00 | |||
| fld.d F1, S2, 0x00 | |||
| fld.d F2, S3, 0x00 | |||
| fld.d F3, S4, 0x00 | |||
| fld.d F4, S5, 0x00 | |||
| fld.d F5, S6, 0x00 | |||
| fld.d F6, S7, 0x00 | |||
| fld.d F7, S8, 0x00 | |||
| fst.d F0, TD, 0x00 // stores are interleaved with the pointer bumps of S1..S8 | |||
| addi.d S1, S1, 0x08 | |||
| fst.d F1, TD, 0x08 | |||
| addi.d S2, S2, 0x08 | |||
| fst.d F2, TD, 0x10 | |||
| addi.d S3, S3, 0x08 | |||
| fst.d F3, TD, 0x18 | |||
| addi.d S4, S4, 0x08 | |||
| fst.d F4, TD, 0x20 | |||
| addi.d S5, S5, 0x08 | |||
| fst.d F5, TD, 0x28 | |||
| addi.d S6, S6, 0x08 | |||
| fst.d F6, TD, 0x30 | |||
| addi.d S7, S7, 0x08 | |||
| fst.d F7, TD, 0x38 | |||
| addi.d S8, S8, 0x08 | |||
| addi.d TD, TD, 0x40 | |||
| fld.d F0, S9, 0x00 // second half: S9..S16, reusing F0..F7 | |||
| fld.d F1, S10, 0x00 | |||
| fld.d F2, S11, 0x00 | |||
| fld.d F3, S12, 0x00 | |||
| fld.d F4, S13, 0x00 | |||
| fld.d F5, S14, 0x00 | |||
| fld.d F6, S15, 0x00 | |||
| fld.d F7, S16, 0x00 | |||
| fst.d F0, TD, 0x00 | |||
| addi.d S9, S9, 0x08 | |||
| fst.d F1, TD, 0x08 | |||
| addi.d S10, S10, 0x08 | |||
| fst.d F2, TD, 0x10 | |||
| addi.d S11, S11, 0x08 | |||
| fst.d F3, TD, 0x18 | |||
| addi.d S12, S12, 0x08 | |||
| fst.d F4, TD, 0x20 | |||
| addi.d S13, S13, 0x08 | |||
| fst.d F5, TD, 0x28 | |||
| addi.d S14, S14, 0x08 | |||
| fst.d F6, TD, 0x30 | |||
| addi.d S15, S15, 0x08 | |||
| fst.d F7, TD, 0x38 | |||
| addi.d S16, S16, 0x08 | |||
| addi.d TD, TD, 0x40 | |||
| addi.d I, I, -1 | |||
| blt ZERO, I, .L_II1 | |||
| .L_I0: /* end of one 16-column group */ | |||
| blt ZERO, J, .L_J1 // next group while J > 0 (J was decremented in .L_J1) | |||
| .L_N8: /* 8-column tail: runs only when bit 3 of N is set */ | |||
| andi J, N, 0x08 | |||
| beq ZERO, J, .L_N4 | |||
| move S1, TS | |||
| add.d S2, TS, TL | |||
| srai.d I, M, 0x03 // I = M >> 3: trip count of .L_8I1 | |||
| add.d S3, S2, TL | |||
| add.d S4, S2, T0 // S4..S8: previous pointer two places back plus T0 (= 2*TL) | |||
| add.d S5, S3, T0 | |||
| add.d S6, S4, T0 | |||
| add.d S7, S5, T0 | |||
| add.d S8, S6, T0 | |||
| add.d TS, S7, T0 // TS = S1 + 8*TL: first column after this tail | |||
| beq I, ZERO, .L_8I3 | |||
| .L_8I1: /* I-- : per iteration, two passes (source offsets 0x00 and 0x20), each loading 32 bytes from S1..S8 and storing 0x100 bytes to TD */ | |||
| xvld U0, S1, 0x00 | |||
| xvld U1, S2, 0x00 | |||
| xvld U2, S3, 0x00 | |||
| xvld U3, S4, 0x00 | |||
| xvld U4, S5, 0x00 | |||
| xvld U5, S6, 0x00 | |||
| xvld U6, S7, 0x00 | |||
| xvld U7, S8, 0x00 | |||
| xvpackev.d D0, U1, U0 | |||
| xvpackod.d D1, U1, U0 | |||
| xvpackev.d D2, U3, U2 | |||
| xvpackod.d D3, U3, U2 | |||
| xvpackev.d D4, U5, U4 | |||
| xvpackod.d D5, U5, U4 | |||
| xvpackev.d D6, U7, U6 | |||
| xvpackod.d D7, U7, U6 | |||
| xvand.v U0, D0, D0 // copy of D0 kept in U0 before D0 is overwritten | |||
| xvpermi.q D0, D2, 0x02 // 0 (numbers match the 0x20-byte store slots below) | |||
| xvand.v U4, D4, D4 | |||
| xvpermi.q D4, D6, 0x02 // 1 | |||
| xvand.v U1, D1, D1 | |||
| xvpermi.q D1, D3, 0x02 // 2 | |||
| xvand.v U5, D5, D5 | |||
| xvpermi.q D5, D7, 0x02 // 3 | |||
| xvpermi.q D2, U0, 0x31 // 4 | |||
| xvpermi.q D6, U4, 0x31 // 5 | |||
| xvpermi.q D3, U1, 0x31 // 6 | |||
| xvpermi.q D7, U5, 0x31 // 7 | |||
| xvst D0, TD, 0x00 | |||
| xvst D4, TD, 0x20 | |||
| xvst D1, TD, 0x40 | |||
| xvst D5, TD, 0x60 | |||
| xvst D2, TD, 0x80 | |||
| xvst D6, TD, 0xA0 | |||
| xvst D3, TD, 0xC0 | |||
| xvst D7, TD, 0xE0 | |||
| addi.d TD, TD, 0x100 | |||
| xvld U0, S1, 0x20 // second pass on the next 32 bytes of each source pointer | |||
| xvld U1, S2, 0x20 | |||
| xvld U2, S3, 0x20 | |||
| xvld U3, S4, 0x20 | |||
| xvld U4, S5, 0x20 | |||
| xvld U5, S6, 0x20 | |||
| xvld U6, S7, 0x20 | |||
| xvld U7, S8, 0x20 | |||
| xvpackev.d D0, U1, U0 | |||
| xvpackod.d D1, U1, U0 | |||
| xvpackev.d D2, U3, U2 | |||
| xvpackod.d D3, U3, U2 | |||
| xvpackev.d D4, U5, U4 | |||
| xvpackod.d D5, U5, U4 | |||
| xvpackev.d D6, U7, U6 | |||
| xvpackod.d D7, U7, U6 | |||
| xvand.v U0, D0, D0 | |||
| xvpermi.q D0, D2, 0x02 // 0 | |||
| xvand.v U4, D4, D4 | |||
| xvpermi.q D4, D6, 0x02 // 1 | |||
| xvand.v U1, D1, D1 | |||
| xvpermi.q D1, D3, 0x02 // 2 | |||
| xvand.v U5, D5, D5 | |||
| xvpermi.q D5, D7, 0x02 // 3 | |||
| xvpermi.q D2, U0, 0x31 // 4 | |||
| xvpermi.q D6, U4, 0x31 // 5 | |||
| xvpermi.q D3, U1, 0x31 // 6 | |||
| xvpermi.q D7, U5, 0x31 // 7 | |||
| xvst D0, TD, 0x00 | |||
| xvst D4, TD, 0x20 | |||
| xvst D1, TD, 0x40 | |||
| xvst D5, TD, 0x60 | |||
| xvst D2, TD, 0x80 | |||
| xvst D6, TD, 0xA0 | |||
| xvst D3, TD, 0xC0 | |||
| xvst D7, TD, 0xE0 | |||
| addi.d TD, TD, 0x100 | |||
| addi.d S1, S1, 0x40 // advance every source pointer by 0x40 bytes | |||
| addi.d S2, S2, 0x40 | |||
| addi.d S3, S3, 0x40 | |||
| addi.d S4, S4, 0x40 | |||
| addi.d S5, S5, 0x40 | |||
| addi.d S6, S6, 0x40 | |||
| addi.d S7, S7, 0x40 | |||
| addi.d S8, S8, 0x40 | |||
| addi.d I, I, -1 | |||
| blt ZERO, I, .L_8I1 | |||
| .L_8I3: /* remainder of the 8-column tail: the low three bits of M */ | |||
| andi I, M, 0x07 // I = M & 7 | |||
| beq I, ZERO, .L_N4 | |||
| .L_8I11: /* I-- : one 8-byte element from each of S1..S8 per iteration, 0x40 bytes written to TD */ | |||
| fld.d F0, S1, 0x00 | |||
| fld.d F1, S2, 0x00 | |||
| fld.d F2, S3, 0x00 | |||
| fld.d F3, S4, 0x00 | |||
| fld.d F4, S5, 0x00 | |||
| fld.d F5, S6, 0x00 | |||
| fld.d F6, S7, 0x00 | |||
| fld.d F7, S8, 0x00 | |||
| fst.d F0, TD, 0x00 | |||
| addi.d S1, S1, 0x08 | |||
| fst.d F1, TD, 0x08 | |||
| addi.d S2, S2, 0x08 | |||
| fst.d F2, TD, 0x10 | |||
| addi.d S3, S3, 0x08 | |||
| fst.d F3, TD, 0x18 | |||
| addi.d S4, S4, 0x08 | |||
| fst.d F4, TD, 0x20 | |||
| addi.d S5, S5, 0x08 | |||
| fst.d F5, TD, 0x28 | |||
| addi.d S6, S6, 0x08 | |||
| fst.d F6, TD, 0x30 | |||
| addi.d S7, S7, 0x08 | |||
| fst.d F7, TD, 0x38 | |||
| addi.d S8, S8, 0x08 | |||
| addi.d TD, TD, 0x40 | |||
| addi.d I, I, -1 | |||
| blt ZERO, I, .L_8I11 | |||
| .L_N4: /* 4-column tail: runs only when bit 2 of N is set */ | |||
| andi J, N, 0x04 | |||
| beq ZERO, J, .L_N2 | |||
| move S1, TS | |||
| add.d S2, TS, TL | |||
| srai.d I, M, 0x02 // I = M >> 2: trip count of .L_4I1 | |||
| add.d S3, S2, TL | |||
| add.d S4, S2, T0 | |||
| add.d TS, S3, T0 // TS = S1 + 4*TL: first column after this tail | |||
| beq I, ZERO, .L_I3 | |||
| .L_4I1: /* I-- : 32 bytes from each of S1..S4 in, 0x80 bytes out */ | |||
| xvld U0, S1, 0x00 | |||
| xvld U1, S2, 0x00 | |||
| xvld U2, S3, 0x00 | |||
| xvld U3, S4, 0x00 | |||
| xvpackev.d D0, U1, U0 | |||
| xvpackod.d D1, U1, U0 | |||
| xvpackev.d D2, U3, U2 | |||
| xvpackod.d D3, U3, U2 | |||
| xvand.v U0, D0, D0 // copy of D0 kept in U0 before D0 is overwritten | |||
| xvpermi.q D0, D2, 0x02 // 0 (numbers match the 0x20-byte store slots below) | |||
| xvand.v U1, D1, D1 | |||
| xvpermi.q D1, D3, 0x02 // 1 | |||
| xvpermi.q D2, U0, 0x31 // 2 | |||
| xvpermi.q D3, U1, 0x31 // 3 | |||
| xvst D0, TD, 0x00 | |||
| xvst D1, TD, 0x20 | |||
| xvst D2, TD, 0x40 | |||
| xvst D3, TD, 0x60 | |||
| addi.d S1, S1, 0x20 | |||
| addi.d S2, S2, 0x20 | |||
| addi.d S3, S3, 0x20 | |||
| addi.d S4, S4, 0x20 | |||
| addi.d TD, TD, 0x80 | |||
| addi.d I, I, -1 | |||
| blt ZERO, I, .L_4I1 | |||
| .L_I3: /* remainder of the 4-column tail: the low two bits of M */ | |||
| andi I, M, 0x03 // I = M & 3 | |||
| beq I, ZERO, .L_N2 | |||
| .L_4II1: /* I-- : one 8-byte element from each of S1..S4 per iteration */ | |||
| fld.d F0, S1, 0x00 | |||
| fld.d F1, S2, 0x00 | |||
| fld.d F2, S3, 0x00 | |||
| fld.d F3, S4, 0x00 | |||
| fst.d F0, TD, 0x00 | |||
| addi.d S1, S1, 0x08 | |||
| fst.d F1, TD, 0x08 | |||
| addi.d S2, S2, 0x08 | |||
| fst.d F2, TD, 0x10 | |||
| addi.d S3, S3, 0x08 | |||
| fst.d F3, TD, 0x18 | |||
| addi.d S4, S4, 0x08 | |||
| addi.d TD, TD, 0x20 | |||
| addi.d I, I, -1 | |||
| blt ZERO, I, .L_4II1 | |||
| .L_N2: /* 2-column tail: runs only when bit 1 of N is set */ | |||
| andi J, N, 0x02 | |||
| beq ZERO, J, .L_N1 | |||
| move S1, TS | |||
| add.d S2, TS, TL | |||
| srai.d I, M, 0x01 // I = M >> 1: two elements per column per iteration | |||
| add.d TS, S2, TL // TS = S1 + 2*TL: first column after this tail | |||
| beq I, ZERO, .L_NI1 | |||
| .L_2I1: /* I-- : copies S1[0], S2[0], S1[1], S2[1] (8-byte elements) to TD */ | |||
| /* Scalar loads are used on purpose. A 32-byte xvld per column would fetch | |||
| /* four elements although only two are consumed, i.e. it would read 16 bytes | |||
| /* beyond the data copied from each column on every iteration, including | |||
| /* the last one. The bytes written to TD are the same either way. */ | |||
| fld.d F0, S1, 0x00 | |||
| fld.d F1, S2, 0x00 | |||
| fld.d F2, S1, 0x08 | |||
| fld.d F3, S2, 0x08 | |||
| fst.d F0, TD, 0x00 | |||
| fst.d F1, TD, 0x08 | |||
| fst.d F2, TD, 0x10 | |||
| fst.d F3, TD, 0x18 | |||
| addi.d S1, S1, 0x10 | |||
| addi.d S2, S2, 0x10 | |||
| addi.d TD, TD, 0x20 | |||
| addi.d I, I, -1 | |||
| blt ZERO, I, .L_2I1 | |||
| .L_NI1: /* odd M: one last element from each of the two columns */ | |||
| andi I, M, 0x01 | |||
| beq I, ZERO, .L_N1 | |||
| fld.d F0, S1, 0x00 | |||
| fld.d F1, S2, 0x00 | |||
| fst.d F0, TD, 0x00 | |||
| addi.d S1, S1, 0x08 | |||
| fst.d F1, TD, 0x08 | |||
| addi.d S2, S2, 0x08 | |||
| addi.d TD, TD, 0x10 | |||
| .L_N1: /* 1-column tail: runs only when bit 0 of N is set */ | |||
| /* Without this test the loop below ran for every N, so for even N it | |||
| /* copied M extra elements from beyond the last packed column and wrote | |||
| /* them past the end of the packed output. */ | |||
| andi J, N, 0x01 | |||
| beq ZERO, J, .L_N0 | |||
| move S1, TS | |||
| beq ZERO, M, .L_N0 // nothing to copy when M is zero | |||
| .L_M1: /* M-- : copy one 8-byte element per iteration; M itself is the counter */ | |||
| fld.d F0, S1, 0x00 | |||
| addi.d S1, S1, 0x08 | |||
| fst.d F0, TD, 0x00 | |||
| addi.d TD, TD, 0x08 | |||
| addi.d M, M, -1 | |||
| blt ZERO, M, .L_M1 | |||
| .L_N0: /* common exit: reload everything the prologue stored, from the same stack offsets */ | |||
| LDARG $r23, $sp, 0x00 | |||
| LDARG $r24, $sp, 0x08 | |||
| LDARG $r25, $sp, 0x10 | |||
| LDARG $r26, $sp, 0x18 | |||
| LDARG $r27, $sp, 0x20 | |||
| LDARG $r28, $sp, 0x28 | |||
| LDARG $r29, $sp, 0x30 | |||
| LDARG $r30, $sp, 0x38 | |||
| LDARG $r31, $sp, 0x40 | |||
| LD $f23, $sp, 0x48 | |||
| LD $f24, $sp, 0x50 | |||
| LD $f25, $sp, 0x58 | |||
| LD $f26, $sp, 0x60 | |||
| LD $f27, $sp, 0x68 | |||
| LD $f28, $sp, 0x70 | |||
| LD $f29, $sp, 0x78 | |||
| LD $f30, $sp, 0x80 | |||
| LD $f31, $sp, 0x88 | |||
| addi.d $sp, $sp, 0x90 // release the 0x90-byte save area | |||
| jirl $r0, $r1, 0x00 // jump to the address in $r1, link register $r0 (presumably the plain return idiom - confirm) | |||
| EPILOGUE | |||
| @@ -0,0 +1,237 @@ | |||
| /******************************************************************************* | |||
| Copyright (c) 2021, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *******************************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| /* Function parameters */ | |||
| #define M $r4 // param 1: m | |||
| #define N $r5 // param 2: n | |||
| #define SRC $r6 // param 3: src | |||
| #define LDA $r7 // param 4: lda | |||
| #define DST $r8 // param 5: dst | |||
| #define I $r9 | |||
| #define J $r10 | |||
| #define S1 $r12 | |||
| #define S2 $r13 | |||
| #define S3 $r14 | |||
| #define S4 $r15 | |||
| #define S5 $r16 | |||
| #define S6 $r17 | |||
| #define S7 $r18 | |||
| #define S8 $r19 | |||
| #define TD $r20 | |||
| #define TS $r11 | |||
| #define TL $r7 | |||
| #define T0 $r23 | |||
| #define ZERO $r0 | |||
| #define F0 $f0 | |||
| #define F1 $f1 | |||
| #define F2 $f2 | |||
| #define F3 $f3 | |||
| #define F4 $f4 | |||
| #define F5 $f5 | |||
| #define F6 $f6 | |||
| #define F7 $f7 | |||
| /* LASX vectors */ | |||
| #define U0 $xr0 | |||
| #define U1 $xr1 | |||
| #define U2 $xr2 | |||
| #define U3 $xr3 | |||
| #define U4 $xr4 | |||
| #define U5 $xr5 | |||
| #define U6 $xr6 | |||
| #define U7 $xr7 | |||
| #define D0 $xr14 | |||
| #define D1 $xr8 | |||
| #define D2 $xr9 | |||
| #define D3 $xr10 | |||
| #define D4 $xr11 | |||
| #define D5 $xr12 | |||
| #define D6 $xr13 | |||
| #define D7 $xr15 | |||
| PROLOGUE | |||
| addi.d $sp, $sp, -8 // 8 bytes of stack for the single saved register | |||
| SDARG $r23, $sp, 0 // $r23 is used below as T0 | |||
| move TD, DST // TD: running destination pointer | |||
| move TS, SRC // TS: running source pointer | |||
| slli.d TL, LDA, 0x03 // TL = LDA << 3 (TL shares the register of LDA); elements are moved 8 bytes at a time | |||
| slli.d T0, TL, 0x01 // T0 = 2 * TL | |||
| srai.d J, N, 0x02 // J = N >> 2: number of 4-column groups | |||
| beq J, ZERO, .L_N2 // no full group: go straight to the tails | |||
| .L_J1: /* J-- : one pass per group of 4 columns, S1..S4 = TS + k*TL (k = 0..3) */ | |||
| move S1, TS | |||
| add.d S2, TS, TL | |||
| srai.d I, M, 0x02 // I = M >> 2: trip count of .L_I1 | |||
| add.d S3, S2, TL | |||
| add.d S4, S2, T0 | |||
| add.d TS, S3, T0 // TS = S1 + 4*TL: start of the next group | |||
| addi.d J, J, -1 | |||
| beq I, ZERO, .L_I3 | |||
| .L_I1: /* I-- : 32 bytes from each of S1..S4 in, 0x80 bytes out */ | |||
| xvld U0, S1, 0x00 | |||
| xvld U1, S2, 0x00 | |||
| xvld U2, S3, 0x00 | |||
| xvld U3, S4, 0x00 | |||
| xvpackev.d D0, U1, U0 | |||
| xvpackod.d D1, U1, U0 | |||
| xvpackev.d D2, U3, U2 | |||
| xvpackod.d D3, U3, U2 | |||
| xvand.v U0, D0, D0 // copy of D0 kept in U0 before D0 is overwritten | |||
| xvpermi.q D0, D2, 0x02 // 0 (numbers match the 0x20-byte store slots below) | |||
| xvand.v U1, D1, D1 | |||
| xvpermi.q D1, D3, 0x02 // 1 | |||
| xvpermi.q D2, U0, 0x31 // 2 | |||
| xvpermi.q D3, U1, 0x31 // 3 | |||
| xvst D0, TD, 0x00 | |||
| xvst D1, TD, 0x20 | |||
| xvst D2, TD, 0x40 | |||
| xvst D3, TD, 0x60 | |||
| addi.d S1, S1, 0x20 | |||
| addi.d S2, S2, 0x20 | |||
| addi.d S3, S3, 0x20 | |||
| addi.d S4, S4, 0x20 | |||
| addi.d TD, TD, 0x80 | |||
| addi.d I, I, -1 | |||
| blt ZERO, I, .L_I1 | |||
| .L_I3: /* remainder of the group: the low two bits of M */ | |||
| andi I, M, 0x03 // I = M & 3 | |||
| beq I, ZERO, .L_I0 | |||
| .L_II1: /* I-- : one 8-byte element from each of S1..S4 per iteration */ | |||
| fld.d F0, S1, 0x00 | |||
| fld.d F1, S2, 0x00 | |||
| fld.d F2, S3, 0x00 | |||
| fld.d F3, S4, 0x00 | |||
| fst.d F0, TD, 0x00 | |||
| addi.d S1, S1, 0x08 | |||
| fst.d F1, TD, 0x08 | |||
| addi.d S2, S2, 0x08 | |||
| fst.d F2, TD, 0x10 | |||
| addi.d S3, S3, 0x08 | |||
| fst.d F3, TD, 0x18 | |||
| addi.d S4, S4, 0x08 | |||
| addi.d TD, TD, 0x20 | |||
| addi.d I, I, -1 | |||
| blt ZERO, I, .L_II1 | |||
| .L_I0: /* end of one 4-column group */ | |||
| blt ZERO, J, .L_J1 // next group while J > 0 | |||
| .L_N2: /* 2-column tail: runs only when bit 1 of N is set */ | |||
| andi J, N, 0x02 | |||
| beq ZERO, J, .L_N1 | |||
| move S1, TS | |||
| add.d S2, TS, TL | |||
| srai.d I, M, 0x02 // I = M >> 2: trip count of .L_2I1 | |||
| add.d TS, S2, TL // TS = S1 + 2*TL: first column after this tail | |||
| beq I, ZERO, .L_2I3 | |||
| .L_2I1: /* I-- : 32 bytes from each of S1 and S2 in, 0x40 bytes out */ | |||
| xvld U0, S1, 0x00 | |||
| xvld U1, S2, 0x00 | |||
| xvpackev.d D0, U1, U0 | |||
| xvpackod.d D1, U1, U0 | |||
| xvand.v U0, D0, D0 // copy of D0 kept in U0 before D0 is overwritten | |||
| xvpermi.q D0, D1, 0x02 // 0 | |||
| xvpermi.q D1, U0, 0x31 // 1 | |||
| xvst D0, TD, 0x00 | |||
| xvst D1, TD, 0x20 | |||
| addi.d S1, S1, 0x20 | |||
| addi.d S2, S2, 0x20 | |||
| addi.d TD, TD, 0x40 | |||
| addi.d I, I, -1 | |||
| blt ZERO, I, .L_2I1 | |||
| .L_2I3: /* remainder of the 2-column tail: the low two bits of M */ | |||
| andi I, M, 0x03 // I = M & 3 | |||
| beq ZERO, I, .L_N1 | |||
| .L_2II1: /* I-- : one 8-byte element from each of S1 and S2 per iteration */ | |||
| fld.d F0, S1, 0x00 | |||
| fld.d F1, S2, 0x00 | |||
| fst.d F0, TD, 0x00 | |||
| addi.d I, I, -1 | |||
| fst.d F1, TD, 0x08 | |||
| addi.d S1, S1, 0x08 | |||
| addi.d S2, S2, 0x08 | |||
| addi.d TD, TD, 0x10 | |||
| blt ZERO, I, .L_2II1 | |||
| .L_N1: /* 1-column tail: runs only when bit 0 of N is set */ | |||
| andi J, N, 0x01 | |||
| beq ZERO, J, .L_N0 | |||
| move S1, TS | |||
| srai.d I, M, 0x02 // I = M >> 2: trip count of .L_1I1 | |||
| beq ZERO, I, .L_1I3 | |||
| .L_1I1: /* I-- : straight 32-byte copy from S1 to TD */ | |||
| xvld U0, S1, 0x00 | |||
| addi.d S1, S1, 0x20 | |||
| xvst U0, TD, 0x00 | |||
| addi.d I, I, -1 | |||
| addi.d TD, TD, 0x20 | |||
| blt ZERO, I, .L_1I1 | |||
| .L_1I3: /* remainder of the column: the low two bits of M */ | |||
| andi I, M, 0x03 // I = M & 3 | |||
| beq ZERO, I, .L_N0 | |||
| .L_1II1: /* I-- : one 8-byte element per iteration */ | |||
| fld.d F0, S1, 0x00 | |||
| addi.d S1, S1, 0x08 | |||
| fst.d F0, TD, 0x00 | |||
| addi.d I, I, -1 | |||
| addi.d TD, TD, 0x08 | |||
| blt ZERO, I, .L_1II1 | |||
| .L_N0: /* common exit: reload the register stored by the prologue */ | |||
| LDARG $r23, $sp, 0 | |||
| addi.d $sp, $sp, 8 // release the 8-byte save slot | |||
| jirl $r0, $r1, 0x00 // jump to the address in $r1, link register $r0 (presumably the plain return idiom - confirm) | |||
| EPILOGUE | |||
| @@ -0,0 +1,710 @@ | |||
| /******************************************************************************* | |||
| Copyright (c) 2021, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *******************************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| /* Function parameters */ | |||
| #define M $r4 // param 1: m | |||
| #define N $r5 // param 2: n | |||
| #define SRC $r6 // param 3: src | |||
| #define LDA $r7 // param 4: lda | |||
| #define DST $r8 // param 5: dst | |||
| #define I $r9 | |||
| #define J $r10 | |||
| #define S0 $r11 | |||
| #define S1 $r12 | |||
| #define S2 $r13 | |||
| #define S3 $r14 | |||
| #define S4 $r15 | |||
| #define S5 $r16 | |||
| #define S6 $r17 | |||
| #define S7 $r18 | |||
| #define S8 $r19 | |||
| #define P0 $r20 | |||
| #define P1 $r23 | |||
| #define P2 $r24 | |||
| #define P3 $r25 | |||
| #define P4 $r26 | |||
| #define P5 $r27 | |||
| #define T0 $r28 | |||
| #define T1 $r29 | |||
| #define TL $r7 | |||
| #define ZERO $r0 | |||
| #define F0 $f0 | |||
| #define F1 $f1 | |||
| #define F2 $f2 | |||
| #define F3 $f3 | |||
| #define F4 $f4 | |||
| #define F5 $f5 | |||
| #define F6 $f6 | |||
| #define F7 $f7 | |||
| /* LASX vectors */ | |||
| #define U0 $xr0 | |||
| #define U1 $xr1 | |||
| #define U2 $xr2 | |||
| #define U3 $xr3 | |||
| #define U4 $xr4 | |||
| #define U5 $xr5 | |||
| #define U6 $xr6 | |||
| #define U7 $xr7 | |||
| PROLOGUE | |||
| addi.d $sp, $sp, -56 | |||
| SDARG $r23, $sp, 0 | |||
| SDARG $r24, $sp, 8 | |||
| SDARG $r25, $sp, 16 | |||
| SDARG $r26, $sp, 24 | |||
| SDARG $r27, $sp, 32 | |||
| SDARG $r28, $sp, 40 | |||
| SDARG $r29, $sp, 48 | |||
| move S0, SRC | |||
| move P0, DST | |||
| srai.d T0, N, 0x04 | |||
| srai.d T1, N, 0x03 | |||
| slli.d T0, T0, 0x04 | |||
| slli.d T1, T1, 0x03 | |||
| mul.d P2, M, T0 | |||
| mul.d P3, M, T1 | |||
| slli.d P2, P2, 0x03 | |||
| slli.d P3, P3, 0x03 | |||
| add.d P2, DST, P2 | |||
| add.d P3, DST, P3 | |||
| srai.d T0, N, 0x02 | |||
| srai.d T1, N, 0x01 | |||
| slli.d T0, T0, 0x02 | |||
| slli.d T1, T1, 0x01 | |||
| mul.d P4, M, T0 | |||
| mul.d P5, M, T1 | |||
| slli.d P4, P4, 0x03 | |||
| slli.d P5, P5, 0x03 | |||
| add.d P4, DST, P4 | |||
| add.d P5, DST, P5 | |||
| slli.d TL, LDA, 0x03 | |||
| srai.d J, M, 0x03 | |||
| slli.d T0, TL, 0x01 | |||
| slli.d T1, M, 0x07 | |||
| beq ZERO, J, .L_M7 | |||
| .L_J1: /* J-- */ | |||
| move S1, S0 | |||
| add.d S2, S0, TL | |||
| add.d S3, S1, T0 | |||
| add.d S4, S2, T0 | |||
| add.d S5, S3, T0 | |||
| add.d S6, S4, T0 | |||
| add.d S7, S5, T0 | |||
| add.d S8, S6, T0 | |||
| add.d S0, S7, T0 | |||
| move P1, P0 | |||
| addi.d P0, P0, 0x400 | |||
| srai.d I, N, 0x04 | |||
| addi.d J, J, -1 | |||
| beq ZERO, I, .L_N15 | |||
| .L_I1: /* I-- */ | |||
| xvld U0, S1, 0x00 | |||
| xvld U1, S1, 0x20 | |||
| xvld U2, S1, 0x40 | |||
| xvld U3, S1, 0x60 | |||
| xvld U4, S2, 0x00 | |||
| xvld U5, S2, 0x20 | |||
| xvld U6, S2, 0x40 | |||
| xvld U7, S2, 0x60 | |||
| xvst U0, P1, 0x00 | |||
| xvst U1, P1, 0x20 | |||
| xvst U2, P1, 0x40 | |||
| xvst U3, P1, 0x60 | |||
| xvst U4, P1, 0x80 | |||
| xvst U5, P1, 0xA0 | |||
| xvst U6, P1, 0xC0 | |||
| xvst U7, P1, 0xE0 | |||
| xvld U0, S3, 0x00 | |||
| xvld U1, S3, 0x20 | |||
| xvld U2, S3, 0x40 | |||
| xvld U3, S3, 0x60 | |||
| xvld U4, S4, 0x00 | |||
| xvld U5, S4, 0x20 | |||
| xvld U6, S4, 0x40 | |||
| xvld U7, S4, 0x60 | |||
| xvst U0, P1, 0x100 | |||
| xvst U1, P1, 0x120 | |||
| xvst U2, P1, 0x140 | |||
| xvst U3, P1, 0x160 | |||
| xvst U4, P1, 0x180 | |||
| xvst U5, P1, 0x1A0 | |||
| xvst U6, P1, 0x1C0 | |||
| xvst U7, P1, 0x1E0 | |||
| xvld U0, S5, 0x00 | |||
| xvld U1, S5, 0x20 | |||
| xvld U2, S5, 0x40 | |||
| xvld U3, S5, 0x60 | |||
| xvld U4, S6, 0x00 | |||
| xvld U5, S6, 0x20 | |||
| xvld U6, S6, 0x40 | |||
| xvld U7, S6, 0x60 | |||
| xvst U0, P1, 0x200 | |||
| xvst U1, P1, 0x220 | |||
| xvst U2, P1, 0x240 | |||
| xvst U3, P1, 0x260 | |||
| xvst U4, P1, 0x280 | |||
| xvst U5, P1, 0x2A0 | |||
| xvst U6, P1, 0x2C0 | |||
| xvst U7, P1, 0x2E0 | |||
| xvld U0, S7, 0x00 | |||
| xvld U1, S7, 0x20 | |||
| xvld U2, S7, 0x40 | |||
| xvld U3, S7, 0x60 | |||
| xvld U4, S8, 0x00 | |||
| xvld U5, S8, 0x20 | |||
| xvld U6, S8, 0x40 | |||
| xvld U7, S8, 0x60 | |||
| xvst U0, P1, 0x300 | |||
| xvst U1, P1, 0x320 | |||
| xvst U2, P1, 0x340 | |||
| xvst U3, P1, 0x360 | |||
| xvst U4, P1, 0x380 | |||
| xvst U5, P1, 0x3A0 | |||
| xvst U6, P1, 0x3C0 | |||
| xvst U7, P1, 0x3E0 | |||
| addi.d S1, S1, 0x80 | |||
| addi.d S2, S2, 0x80 | |||
| addi.d S3, S3, 0x80 | |||
| addi.d S4, S4, 0x80 | |||
| addi.d S5, S5, 0x80 | |||
| addi.d S6, S6, 0x80 | |||
| addi.d S7, S7, 0x80 | |||
| addi.d S8, S8, 0x80 | |||
| addi.d I, I, -1 | |||
| add.d P1, P1, T1 | |||
| blt ZERO, I, .L_I1 | |||
| .L_N15: | |||
| andi I, N, 0x08 | |||
| beq ZERO, I, .L_N7 | |||
| xvld U0, S1, 0x00 | |||
| xvld U1, S1, 0x20 | |||
| xvld U2, S2, 0x00 | |||
| xvld U3, S2, 0x20 | |||
| xvld U4, S3, 0x00 | |||
| xvld U5, S3, 0x20 | |||
| xvld U6, S4, 0x00 | |||
| xvld U7, S4, 0x20 | |||
| xvst U0, P2, 0x00 | |||
| xvst U1, P2, 0x20 | |||
| xvst U2, P2, 0x40 | |||
| xvst U3, P2, 0x60 | |||
| xvst U4, P2, 0x80 | |||
| xvst U5, P2, 0xA0 | |||
| xvst U6, P2, 0xC0 | |||
| xvst U7, P2, 0xE0 | |||
| xvld U0, S5, 0x00 | |||
| xvld U1, S5, 0x20 | |||
| xvld U2, S6, 0x00 | |||
| xvld U3, S6, 0x20 | |||
| xvld U4, S7, 0x00 | |||
| xvld U5, S7, 0x20 | |||
| xvld U6, S8, 0x00 | |||
| xvld U7, S8, 0x20 | |||
| xvst U0, P2, 0x100 | |||
| xvst U1, P2, 0x120 | |||
| xvst U2, P2, 0x140 | |||
| xvst U3, P2, 0x160 | |||
| xvst U4, P2, 0x180 | |||
| xvst U5, P2, 0x1A0 | |||
| xvst U6, P2, 0x1C0 | |||
| xvst U7, P2, 0x1E0 | |||
| addi.d S1, S1, 0x40 | |||
| addi.d S2, S2, 0x40 | |||
| addi.d S3, S3, 0x40 | |||
| addi.d S4, S4, 0x40 | |||
| addi.d S5, S5, 0x40 | |||
| addi.d S6, S6, 0x40 | |||
| addi.d S7, S7, 0x40 | |||
| addi.d S8, S8, 0x40 | |||
| addi.d P2, P2, 0x200 | |||
| .L_N7: | |||
| andi I, N, 0x04 | |||
| beq ZERO, I, .L_N3 | |||
| xvld U0, S1, 0x00 | |||
| xvld U1, S2, 0x00 | |||
| xvld U2, S3, 0x00 | |||
| xvld U3, S4, 0x00 | |||
| xvld U4, S5, 0x00 | |||
| xvld U5, S6, 0x00 | |||
| xvld U6, S7, 0x00 | |||
| xvld U7, S8, 0x00 | |||
| xvst U0, P3, 0x00 | |||
| xvst U1, P3, 0x20 | |||
| xvst U2, P3, 0x40 | |||
| xvst U3, P3, 0x60 | |||
| xvst U4, P3, 0x80 | |||
| xvst U5, P3, 0xA0 | |||
| xvst U6, P3, 0xC0 | |||
| xvst U7, P3, 0xE0 | |||
| addi.d S1, S1, 0x20 | |||
| addi.d S2, S2, 0x20 | |||
| addi.d S3, S3, 0x20 | |||
| addi.d S4, S4, 0x20 | |||
| addi.d S5, S5, 0x20 | |||
| addi.d S6, S6, 0x20 | |||
| addi.d S7, S7, 0x20 | |||
| addi.d S8, S8, 0x20 | |||
| addi.d P3, P3, 0x100 | |||
| .L_N3: | |||
| andi I, N, 0x02 | |||
| beq ZERO, I, .L_N1 | |||
| xvld U0, S1, 0x00 | |||
| xvld U1, S2, 0x00 | |||
| xvld U2, S3, 0x00 | |||
| xvld U3, S4, 0x00 | |||
| xvld U4, S5, 0x00 | |||
| xvld U5, S6, 0x00 | |||
| xvld U6, S7, 0x00 | |||
| xvld U7, S8, 0x00 | |||
| xvpermi.q U0, U1, 0x02 | |||
| xvpermi.q U2, U3, 0x02 | |||
| xvpermi.q U4, U5, 0x02 | |||
| xvpermi.q U6, U7, 0x02 | |||
| xvst U0, P4, 0x00 | |||
| xvst U2, P4, 0x20 | |||
| xvst U4, P4, 0x40 | |||
| xvst U6, P4, 0x60 | |||
| addi.d S1, S1, 0x10 | |||
| addi.d S2, S2, 0x10 | |||
| addi.d S3, S3, 0x10 | |||
| addi.d S4, S4, 0x10 | |||
| addi.d S5, S5, 0x10 | |||
| addi.d S6, S6, 0x10 | |||
| addi.d S7, S7, 0x10 | |||
| addi.d S8, S8, 0x10 | |||
| addi.d P4, P4, 0x80 | |||
| .L_N1: | |||
| andi I, N, 0x01 | |||
| beq ZERO, I, .L_N0 | |||
| fld.d F0, S1, 0x00 | |||
| fld.d F1, S2, 0x00 | |||
| fld.d F2, S3, 0x00 | |||
| fld.d F3, S4, 0x00 | |||
| fld.d F4, S5, 0x00 | |||
| fld.d F5, S6, 0x00 | |||
| fld.d F6, S7, 0x00 | |||
| fld.d F7, S8, 0x00 | |||
| fst.d F0, P5, 0x00 | |||
| fst.d F1, P5, 0x08 | |||
| fst.d F2, P5, 0x10 | |||
| fst.d F3, P5, 0x18 | |||
| fst.d F4, P5, 0x20 | |||
| fst.d F5, P5, 0x28 | |||
| fst.d F6, P5, 0x30 | |||
| fst.d F7, P5, 0x38 | |||
| addi.d S1, S1, 0x08 | |||
| addi.d S2, S2, 0x08 | |||
| addi.d S3, S3, 0x08 | |||
| addi.d S4, S4, 0x08 | |||
| addi.d S5, S5, 0x08 | |||
| addi.d S6, S6, 0x08 | |||
| addi.d S7, S7, 0x08 | |||
| addi.d S8, S8, 0x08 | |||
| addi.d P5, P5, 0x40 | |||
| .L_N0: | |||
| blt ZERO, J, .L_J1 | |||
| .L_M7: | |||
| andi J, M, 0x04 | |||
| beq ZERO, J, .L_M3 | |||
| move S1, S0 | |||
| add.d S2, S0, TL | |||
| add.d S3, S1, T0 | |||
| add.d S4, S2, T0 | |||
| add.d S0, S3, T0 | |||
| move P1, P0 | |||
| addi.d P0, P0, 0x200 | |||
| srai.d I, N, 0x04 | |||
| beq ZERO, I, .L_4N15 | |||
| .L_4I1: /* I-- */ | |||
| xvld U0, S1, 0x00 | |||
| xvld U1, S1, 0x20 | |||
| xvld U2, S1, 0x40 | |||
| xvld U3, S1, 0x60 | |||
| xvld U4, S2, 0x00 | |||
| xvld U5, S2, 0x20 | |||
| xvld U6, S2, 0x40 | |||
| xvld U7, S2, 0x60 | |||
| xvst U0, P1, 0x00 | |||
| xvst U1, P1, 0x20 | |||
| xvst U2, P1, 0x40 | |||
| xvst U3, P1, 0x60 | |||
| xvst U4, P1, 0x80 | |||
| xvst U5, P1, 0xA0 | |||
| xvst U6, P1, 0xC0 | |||
| xvst U7, P1, 0xE0 | |||
| xvld U0, S3, 0x00 | |||
| xvld U1, S3, 0x20 | |||
| xvld U2, S3, 0x40 | |||
| xvld U3, S3, 0x60 | |||
| xvld U4, S4, 0x00 | |||
| xvld U5, S4, 0x20 | |||
| xvld U6, S4, 0x40 | |||
| xvld U7, S4, 0x60 | |||
| xvst U0, P1, 0x100 | |||
| xvst U1, P1, 0x120 | |||
| xvst U2, P1, 0x140 | |||
| xvst U3, P1, 0x160 | |||
| xvst U4, P1, 0x180 | |||
| xvst U5, P1, 0x1A0 | |||
| xvst U6, P1, 0x1C0 | |||
| xvst U7, P1, 0x1E0 | |||
| addi.d S1, S1, 0x80 | |||
| addi.d S2, S2, 0x80 | |||
| addi.d S3, S3, 0x80 | |||
| addi.d S4, S4, 0x80 | |||
| addi.d I, I, -1 | |||
| add.d P1, P1, T1 | |||
| blt ZERO, I, .L_4I1 | |||
| .L_4N15: | |||
| andi I, N, 0x08 | |||
| beq ZERO, I, .L_4N7 | |||
| xvld U0, S1, 0x00 | |||
| xvld U1, S1, 0x20 | |||
| xvld U2, S2, 0x00 | |||
| xvld U3, S2, 0x20 | |||
| xvld U4, S3, 0x00 | |||
| xvld U5, S3, 0x20 | |||
| xvld U6, S4, 0x00 | |||
| xvld U7, S4, 0x20 | |||
| xvst U0, P2, 0x00 | |||
| xvst U1, P2, 0x20 | |||
| xvst U2, P2, 0x40 | |||
| xvst U3, P2, 0x60 | |||
| xvst U4, P2, 0x80 | |||
| xvst U5, P2, 0xA0 | |||
| xvst U6, P2, 0xC0 | |||
| xvst U7, P2, 0xE0 | |||
| addi.d S1, S1, 0x40 | |||
| addi.d S2, S2, 0x40 | |||
| addi.d S3, S3, 0x40 | |||
| addi.d S4, S4, 0x40 | |||
| addi.d P2, P2, 0x100 | |||
| .L_4N7: | |||
| andi I, N, 0x04 | |||
| beq ZERO, I, .L_4N3 | |||
| xvld U0, S1, 0x00 | |||
| xvld U1, S2, 0x00 | |||
| xvld U2, S3, 0x00 | |||
| xvld U3, S4, 0x00 | |||
| xvst U0, P3, 0x00 | |||
| xvst U1, P3, 0x20 | |||
| xvst U2, P3, 0x40 | |||
| xvst U3, P3, 0x60 | |||
| addi.d S1, S1, 0x20 | |||
| addi.d S2, S2, 0x20 | |||
| addi.d S3, S3, 0x20 | |||
| addi.d S4, S4, 0x20 | |||
| addi.d P3, P3, 0x80 | |||
| .L_4N3: | |||
| andi I, N, 0x02 | |||
| beq ZERO, I, .L_4N1 | |||
| xvld U0, S1, 0x00 | |||
| xvld U1, S2, 0x00 | |||
| xvld U2, S3, 0x00 | |||
| xvld U3, S4, 0x00 | |||
| xvpermi.q U0, U1, 0x02 | |||
| xvpermi.q U2, U3, 0x02 | |||
| xvst U0, P4, 0x00 | |||
| xvst U2, P4, 0x20 | |||
| addi.d S1, S1, 0x10 | |||
| addi.d S2, S2, 0x10 | |||
| addi.d S3, S3, 0x10 | |||
| addi.d S4, S4, 0x10 | |||
| addi.d P4, P4, 0x40 | |||
| .L_4N1: | |||
| andi I, N, 0x01 | |||
| beq ZERO, I, .L_M3 | |||
| fld.d F0, S1, 0x00 | |||
| fld.d F1, S2, 0x00 | |||
| fld.d F2, S3, 0x00 | |||
| fld.d F3, S4, 0x00 | |||
| fst.d F0, P5, 0x00 | |||
| fst.d F1, P5, 0x08 | |||
| fst.d F2, P5, 0x10 | |||
| fst.d F3, P5, 0x18 | |||
| addi.d S1, S1, 0x08 | |||
| addi.d S2, S2, 0x08 | |||
| addi.d S3, S3, 0x08 | |||
| addi.d S4, S4, 0x08 | |||
| addi.d P5, P5, 0x20 | |||
| .L_M3: | |||
| andi J, M, 0x02 | |||
| beq ZERO, J, .L_M1 | |||
| move S1, S0 | |||
| add.d S2, S0, TL | |||
| add.d S0, S0, T0 | |||
| move P1, P0 | |||
| addi.d P0, P0, 0x100 | |||
| srai.d I, N, 0x04 | |||
| beq ZERO, I, .L_2N15 | |||
| .L_2I1: /* I-- */ | |||
| xvld U0, S1, 0x00 | |||
| xvld U1, S1, 0x20 | |||
| xvld U2, S1, 0x40 | |||
| xvld U3, S1, 0x60 | |||
| xvld U4, S2, 0x00 | |||
| xvld U5, S2, 0x20 | |||
| xvld U6, S2, 0x40 | |||
| xvld U7, S2, 0x60 | |||
| xvst U0, P1, 0x00 | |||
| xvst U1, P1, 0x20 | |||
| xvst U2, P1, 0x40 | |||
| xvst U3, P1, 0x60 | |||
| xvst U4, P1, 0x80 | |||
| xvst U5, P1, 0xA0 | |||
| xvst U6, P1, 0xC0 | |||
| xvst U7, P1, 0xE0 | |||
| addi.d S1, S1, 0x80 | |||
| addi.d S2, S2, 0x80 | |||
| addi.d I, I, -1 | |||
| add.d P1, P1, T1 | |||
| blt ZERO, I, .L_2I1 | |||
| .L_2N15: | |||
| andi I, N, 0x08 | |||
| beq ZERO, I, .L_2N7 | |||
| xvld U0, S1, 0x00 | |||
| xvld U1, S1, 0x20 | |||
| xvld U2, S2, 0x00 | |||
| xvld U3, S2, 0x20 | |||
| xvst U0, P2, 0x00 | |||
| xvst U1, P2, 0x20 | |||
| xvst U2, P2, 0x40 | |||
| xvst U3, P2, 0x60 | |||
| addi.d S1, S1, 0x40 | |||
| addi.d S2, S2, 0x40 | |||
| addi.d P2, P2, 0x80 | |||
| .L_2N7: | |||
| andi I, N, 0x04 | |||
| beq ZERO, I, .L_2N3 | |||
| xvld U0, S1, 0x00 | |||
| xvld U1, S2, 0x00 | |||
| xvst U0, P3, 0x00 | |||
| xvst U1, P3, 0x20 | |||
| addi.d S1, S1, 0x20 | |||
| addi.d S2, S2, 0x20 | |||
| addi.d P3, P3, 0x40 | |||
| .L_2N3: | |||
| andi I, N, 0x02 | |||
| beq ZERO, I, .L_2N1 | |||
| xvld U0, S1, 0x00 | |||
| xvld U1, S2, 0x00 | |||
| xvpermi.q U0, U1, 0x02 | |||
| xvst U0, P4, 0x00 | |||
| addi.d S1, S1, 0x10 | |||
| addi.d S2, S2, 0x10 | |||
| addi.d P4, P4, 0x20 | |||
| .L_2N1: | |||
| andi I, N, 0x01 | |||
| beq ZERO, I, .L_M1 | |||
| fld.d F0, S1, 0x00 | |||
| fld.d F1, S2, 0x00 | |||
| fst.d F0, P5, 0x00 | |||
| fst.d F1, P5, 0x08 | |||
| addi.d S1, S1, 0x08 | |||
| addi.d S2, S2, 0x08 | |||
| addi.d P5, P5, 0x10 | |||
| .L_M1: | |||
| andi J, M, 0x01 | |||
| beq ZERO, J, .L_M0 | |||
| move S1, S0 | |||
| add.d S2, S0, TL | |||
| move P1, P0 | |||
| addi.d P0, P0, 0x80 | |||
| srai.d I, N, 0x04 | |||
| beq ZERO, I, .L_1N15 | |||
| .L_1I1: /* I-- */ | |||
| xvld U0, S1, 0x00 | |||
| xvld U1, S1, 0x20 | |||
| xvld U2, S1, 0x40 | |||
| xvld U3, S1, 0x60 | |||
| xvst U0, P1, 0x00 | |||
| xvst U1, P1, 0x20 | |||
| xvst U2, P1, 0x40 | |||
| xvst U3, P1, 0x60 | |||
| addi.d S1, S1, 0x80 | |||
| addi.d I, I, -1 | |||
| add.d P1, P1, T1 | |||
| blt ZERO, I, .L_1I1 | |||
| .L_1N15: | |||
| andi I, N, 0x08 | |||
| beq ZERO, I, .L_1N7 | |||
| xvld U0, S1, 0x00 | |||
| xvld U1, S1, 0x20 | |||
| xvst U0, P2, 0x00 | |||
| xvst U1, P2, 0x20 | |||
| addi.d S1, S1, 0x40 | |||
| addi.d P2, P2, 0x40 | |||
| .L_1N7: | |||
| andi I, N, 0x04 | |||
| beq ZERO, I, .L_1N3 | |||
| xvld U0, S1, 0x00 | |||
| xvst U0, P3, 0x00 | |||
| addi.d S1, S1, 0x20 | |||
| addi.d P3, P3, 0x20 | |||
| .L_1N3: | |||
| andi I, N, 0x02 | |||
| beq ZERO, I, .L_1N1 | |||
| fld.d F0, S1, 0x00 | |||
| fld.d F1, S1, 0x08 | |||
| fst.d F0, P4, 0x00 | |||
| fst.d F1, P4, 0x08 | |||
| addi.d S1, S1, 0x10 | |||
| addi.d P4, P4, 0x10 | |||
| .L_1N1: | |||
| andi I, N, 0x01 | |||
| beq ZERO, I, .L_M0 | |||
| fld.d F0, S1, 0x00 | |||
| fst.d F0, P5, 0x00 | |||
| addi.d S1, S1, 0x08 | |||
| addi.d P5, P5, 0x08 | |||
| .L_M0: | |||
| LDARG $r23, $sp, 0 | |||
| LDARG $r24, $sp, 8 | |||
| LDARG $r25, $sp, 16 | |||
| LDARG $r26, $sp, 24 | |||
| LDARG $r27, $sp, 32 | |||
| LDARG $r28, $sp, 40 | |||
| LDARG $r29, $sp, 48 | |||
| addi.d $sp, $sp, 56 | |||
| jirl $r0, $r1, 0x00 | |||
| EPILOGUE | |||
| @@ -0,0 +1,270 @@ | |||
| /******************************************************************************* | |||
| Copyright (c) 2021, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *******************************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| /* Function parameters */ | |||
| #define M $r4 // param 1: m | |||
| #define N $r5 // param 2: n | |||
| #define SRC $r6 // param 3: src | |||
| #define LDA $r7 // param 4: lda | |||
| #define DST $r8 // param 5: dst | |||
| /* Loop counters: I is derived from N (inner loops), J from M (outer loops) */ | |||
| #define I $r9 | |||
| #define J $r10 | |||
| /* Source cursors: S0 is the start of the next group of lines, S1..S4 walk the lines of the current group */ | |||
| #define S0 $r11 | |||
| #define S1 $r12 | |||
| #define S2 $r13 | |||
| #define S3 $r14 | |||
| #define S4 $r15 | |||
| /* Destination cursors: P0/P1 for the 4-wide groups, P2 for the (N & 2) remainder, P3 for the (N & 1) remainder */ | |||
| #define P0 $r16 | |||
| #define P1 $r17 | |||
| #define P2 $r18 | |||
| #define P3 $r19 | |||
| /* Scratch / byte strides; $r23 (T1) is saved and restored on the stack by the routine */ | |||
| #define T0 $r20 | |||
| #define T1 $r23 | |||
| /* TL shares $r7 with LDA: the routine overwrites LDA with LDA << 3 (the stride in bytes) */ | |||
| #define TL $r7 | |||
| #define ZERO $r0 | |||
| /* Scalar floating-point temporaries used for the single-element tails */ | |||
| #define F0 $f0 | |||
| #define F1 $f1 | |||
| #define F2 $f2 | |||
| #define F3 $f3 | |||
| /* LASX vectors */ | |||
| #define U0 $xr0 | |||
| #define U1 $xr1 | |||
| #define U2 $xr2 | |||
| #define U3 $xr3 | |||
| /* Packs M source lines of N 8-byte elements each (line stride = LDA elements) from SRC into DST. | |||
|  * Lines are consumed in groups of 4, then 2, then 1 (bits of M); elements along a line | |||
|  * in groups of 4, then 2, then 1 (bits of N). | |||
|  * DST layout: all 4-element groups first (consecutive groups are M * 32 bytes apart), | |||
|  * then the 2-element remainders starting at DST + M * (N & ~3) * 8, | |||
|  * then the 1-element remainders starting at DST + M * (N & ~1) * 8. | |||
|  * Arguments: M ($r4), N ($r5), SRC ($r6), LDA ($r7), DST ($r8). No value is returned. */ | |||
| PROLOGUE | |||
| /* Save $r23 (used as T1) on the stack; it is restored at .L_M0 */ | |||
| addi.d $sp, $sp, -8 | |||
| SDARG $r23, $sp, 0 | |||
| move S0, SRC | |||
| move P0, DST | |||
| /* T0 = N rounded down to a multiple of 4, T1 = N rounded down to a multiple of 2 */ | |||
| srai.d T0, N, 0x02 | |||
| slli.d T0, T0, 0x02 | |||
| srai.d T1, N, 0x01 | |||
| slli.d T1, T1, 0x01 | |||
| /* Convert both to byte offsets: M * count * 8 */ | |||
| mul.d T0, M, T0 | |||
| mul.d T1, M, T1 | |||
| slli.d T0, T0, 0x03 | |||
| slli.d T1, T1, 0x03 | |||
| /* P2 = start of the 2-element remainder area, P3 = start of the 1-element remainder area */ | |||
| add.d P2, DST, T0 | |||
| add.d P3, DST, T1 | |||
| /* TL = line stride in bytes (overwrites LDA), T0 = two lines in bytes, T1 = M * 32 bytes */ | |||
| slli.d TL, LDA, 0x03 | |||
| srai.d J, M, 0x02 | |||
| slli.d T0, TL, 0x01 | |||
| slli.d T1, M, 0x05 | |||
| beq ZERO, J, .L_M3 | |||
| /* ---- Groups of 4 source lines ---- */ | |||
| .L_J1: /* J-- */ | |||
| /* S1..S4 = the four lines of this group; S0 moves on to the next group */ | |||
| move S1, S0 | |||
| add.d S2, S0, TL | |||
| add.d S3, S1, T0 | |||
| add.d S4, S2, T0 | |||
| add.d S0, S3, T0 | |||
| /* Each group of 4 lines occupies 0x80 bytes inside every 4-element destination group */ | |||
| move P1, P0 | |||
| addi.d P0, P0, 0x80 | |||
| srai.d I, N, 0x02 | |||
| addi.d J, J, -1 | |||
| beq ZERO, I, .L_N3 | |||
| .L_I1: /* I-- */ | |||
| /* Copy 4 elements (32 bytes) from each of the 4 lines, stored back to back */ | |||
| xvld U0, S1, 0x00 | |||
| xvld U1, S2, 0x00 | |||
| xvld U2, S3, 0x00 | |||
| xvld U3, S4, 0x00 | |||
| xvst U0, P1, 0x00 | |||
| xvst U1, P1, 0x20 | |||
| xvst U2, P1, 0x40 | |||
| xvst U3, P1, 0x60 | |||
| addi.d S1, S1, 0x20 | |||
| addi.d S2, S2, 0x20 | |||
| addi.d S3, S3, 0x20 | |||
| addi.d S4, S4, 0x20 | |||
| /* Step to the same position in the next 4-element destination group */ | |||
| add.d P1, P1, T1 | |||
| addi.d I, I, -1 | |||
| blt ZERO, I, .L_I1 | |||
| .L_N3: | |||
| /* Remainder of 2 elements per line (bit 1 of N) */ | |||
| andi I, N, 0x02 | |||
| beq ZERO, I, .L_N1 | |||
| /* NOTE: each xvld fetches 32 bytes although only the low 16 bytes are kept below */ | |||
| xvld U0, S1, 0x00 | |||
| xvld U1, S2, 0x00 | |||
| xvld U2, S3, 0x00 | |||
| xvld U3, S4, 0x00 | |||
| /* Merge the low 128 bits of two lines into one vector: U0 = {U0.lo, U1.lo}, U2 = {U2.lo, U3.lo} */ | |||
| xvpermi.q U0, U1, 0x02 | |||
| xvpermi.q U2, U3, 0x02 | |||
| xvst U0, P2, 0x00 | |||
| xvst U2, P2, 0x20 | |||
| addi.d S1, S1, 0x10 | |||
| addi.d S2, S2, 0x10 | |||
| addi.d S3, S3, 0x10 | |||
| addi.d S4, S4, 0x10 | |||
| addi.d P2, P2, 0x40 | |||
| .L_N1: | |||
| /* Remainder of 1 element per line (bit 0 of N) */ | |||
| andi I, N, 0x01 | |||
| beq ZERO, I, .L_N0 | |||
| fld.d F0, S1, 0x00 | |||
| fld.d F1, S2, 0x00 | |||
| fld.d F2, S3, 0x00 | |||
| fld.d F3, S4, 0x00 | |||
| fst.d F0, P3, 0x00 | |||
| fst.d F1, P3, 0x08 | |||
| fst.d F2, P3, 0x10 | |||
| fst.d F3, P3, 0x18 | |||
| addi.d S1, S1, 0x08 | |||
| addi.d S2, S2, 0x08 | |||
| addi.d S3, S3, 0x08 | |||
| addi.d S4, S4, 0x08 | |||
| addi.d P3, P3, 0x20 | |||
| .L_N0: | |||
| blt ZERO, J, .L_J1 | |||
| /* ---- Group of 2 source lines (bit 1 of M) ---- */ | |||
| .L_M3: | |||
| andi J, M, 0x02 | |||
| beq ZERO, J, .L_M1 | |||
| move S1, S0 | |||
| add.d S2, S0, TL | |||
| add.d S0, S0, T0 | |||
| move P1, P0 | |||
| addi.d P0, P0, 0x40 | |||
| srai.d I, N, 0x02 | |||
| beq ZERO, I, .L_2N3 | |||
| .L_2I1: /* I-- */ | |||
| /* Copy 4 elements from each of the 2 lines */ | |||
| xvld U0, S1, 0x00 | |||
| xvld U1, S2, 0x00 | |||
| xvst U0, P1, 0x00 | |||
| xvst U1, P1, 0x20 | |||
| addi.d S1, S1, 0x20 | |||
| addi.d S2, S2, 0x20 | |||
| addi.d I, I, -1 | |||
| add.d P1, P1, T1 | |||
| blt ZERO, I, .L_2I1 | |||
| .L_2N3: | |||
| /* Remainder of 2 elements per line */ | |||
| andi I, N, 0x02 | |||
| beq ZERO, I, .L_2N1 | |||
| xvld U0, S1, 0x00 | |||
| xvld U1, S2, 0x00 | |||
| /* U0 = {U0.lo, U1.lo}: two elements of each line in one 32-byte store */ | |||
| xvpermi.q U0, U1, 0x02 | |||
| xvst U0, P2, 0x00 | |||
| addi.d S1, S1, 0x10 | |||
| addi.d S2, S2, 0x10 | |||
| addi.d P2, P2, 0x20 | |||
| .L_2N1: | |||
| /* Remainder of 1 element per line: test bit 0 of N with andi. | |||
|  * (addi.d would yield N + 1, which is never zero for N >= 0, so the tail would also run | |||
|  * for even N and store 16 bytes at P3 beyond the packed data.) */ | |||
| andi I, N, 0x01 | |||
| beq ZERO, I, .L_M1 | |||
| fld.d F0, S1, 0x00 | |||
| fld.d F1, S2, 0x00 | |||
| fst.d F0, P3, 0x00 | |||
| fst.d F1, P3, 0x08 | |||
| addi.d S1, S1, 0x08 | |||
| addi.d S2, S2, 0x08 | |||
| addi.d P3, P3, 0x10 | |||
| /* ---- Last single source line (bit 0 of M) ---- */ | |||
| .L_M1: | |||
| andi J, M, 0x01 | |||
| beq ZERO, J, .L_M0 | |||
| move S1, S0 | |||
| move P1, P0 | |||
| srai.d I, N, 0x02 | |||
| beq ZERO, I, .L_1N3 | |||
| .L_1I1: /* I-- */ | |||
| /* Copy 4 elements of the line */ | |||
| xvld U0, S1, 0x00 | |||
| xvst U0, P1, 0x00 | |||
| addi.d S1, S1, 0x20 | |||
| addi.d I, I, -1 | |||
| add.d P1, P1, T1 | |||
| blt ZERO, I, .L_1I1 | |||
| .L_1N3: | |||
| /* Remainder of 2 elements, copied with scalar loads/stores */ | |||
| andi I, N, 0x02 | |||
| beq I, ZERO, .L_1N1 | |||
| fld.d F0, S1, 0x00 | |||
| fld.d F1, S1, 0x08 | |||
| fst.d F0, P2, 0x00 | |||
| fst.d F1, P2, 0x08 | |||
| addi.d S1, S1, 0x10 | |||
| addi.d P2, P2, 0x10 | |||
| .L_1N1: | |||
| /* Remainder of 1 element; cursors are not advanced because nothing follows */ | |||
| andi I, N, 0x01 | |||
| beq I, ZERO, .L_M0 | |||
| fld.d F0, S1, 0x00 | |||
| fst.d F0, P3, 0x00 | |||
| .L_M0: | |||
| /* Restore $r23, release the stack slot and return through $r1 */ | |||
| LDARG $r23, $sp, 0 | |||
| addi.d $sp, $sp, 8 | |||
| jirl $r0, $r1, 0x00 | |||
| EPILOGUE | |||
| @@ -39,11 +39,19 @@ IZAMINKERNEL = izamax.S | |||
| endif | |||
| ifndef ISMINKERNEL | |||
| ISMINKERNEL = iamax.S | |||
| ISMINKERNEL = imax.S | |||
| endif | |||
| ifndef IDMINKERNEL | |||
| IDMINKERNEL = iamax.S | |||
| IDMINKERNEL = imax.S | |||
| endif | |||
| ifndef ISMAXKERNEL | |||
| ISMAXKERNEL = imax.S | |||
| endif | |||
| ifndef IDMAXKERNEL | |||
| IDMAXKERNEL = imax.S | |||
| endif | |||
| ifndef SNRM2KERNEL | |||
| @@ -130,7 +130,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| mode = BLAS_DOUBLE | BLAS_COMPLEX; | |||
| #endif | |||
| blas_level1_thread_with_return_value(mode, n, 0, 0, dummy_alpha, x, inc_x, | |||
| NULL, 0, result, 0, (void *)asum_thread_function, nthreads); | |||
| NULL, 0, result, 0, (int (*)(void))asum_thread_function, nthreads); | |||
| ptr = (FLOAT *)result; | |||
| for (i = 0; i < nthreads; i++) { | |||
| sumf += (*ptr); | |||
| @@ -114,7 +114,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| #else | |||
| mode = BLAS_DOUBLE | BLAS_REAL; | |||
| #endif | |||
| blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, x, inc_x, NULL, 0, result, 0, (void *)asum_thread_function, nthreads); | |||
| blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, x, inc_x, NULL, 0, result, 0, (int (*)(void))asum_thread_function, nthreads); | |||
| ptr = (FLOAT *)result; | |||
| for (i = 0; i < nthreads; i++) { | |||
| sumf += (*ptr); | |||
| @@ -190,7 +190,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| #endif | |||
| blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, | |||
| x, inc_x, y, inc_y, result, 0, | |||
| ( void *)dot_thread_function, nthreads); | |||
| (int (*)(void)) dot_thread_function, nthreads); | |||
| ptr = (RETURN_TYPE *)result; | |||
| for (i = 0; i < nthreads; i++) { | |||
| @@ -196,7 +196,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT | |||
| #else | |||
| int mode = BLAS_SINGLE | BLAS_REAL | BLAS_PTHREAD; | |||
| #endif | |||
| blas_level1_thread(mode, n, 0, 0, alpha, x, inc_x, y, inc_y, &dummy_c, 0, (void *)rot_thread_function, nthreads); | |||
| blas_level1_thread(mode, n, 0, 0, alpha, x, inc_x, y, inc_y, &dummy_c, 0, (int (*)(void))rot_thread_function, nthreads); | |||
| } | |||
| #else | |||
| rot_compute(n, x, inc_x, y, inc_y, c, s); | |||
| @@ -123,7 +123,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| #else | |||
| mode = BLAS_DOUBLE | BLAS_REAL; | |||
| #endif | |||
| blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, x, inc_x, NULL, 0, result, 0, (void *)asum_thread_function, nthreads); | |||
| blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, x, inc_x, NULL, 0, result, 0, (int (*)(void))asum_thread_function, nthreads); | |||
| ptr = (FLOAT *)result; | |||
| for (i = 0; i < nthreads; i++) { | |||
| sumf += (*ptr); | |||
| @@ -198,7 +198,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT | |||
| #else | |||
| int mode = BLAS_SINGLE | BLAS_REAL | BLAS_PTHREAD; | |||
| #endif | |||
| blas_level1_thread(mode, n, 0, 0, alpha, x, inc_x, y, inc_y, &dummy_c, 0, (void *)rot_thread_function, nthreads); | |||
| blas_level1_thread(mode, n, 0, 0, alpha, x, inc_x, y, inc_y, &dummy_c, 0, (int (*)(void))rot_thread_function, nthreads); | |||
| } | |||
| #else | |||
| rot_compute(n, x, inc_x, y, inc_y, c, s); | |||
| @@ -130,7 +130,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| mode = BLAS_DOUBLE | BLAS_COMPLEX; | |||
| #endif | |||
| blas_level1_thread_with_return_value(mode, n, 0, 0, dummy_alpha, x, inc_x, | |||
| NULL, 0, result, 0, (void *)asum_thread_function, nthreads); | |||
| NULL, 0, result, 0, (int (*)(void))asum_thread_function, nthreads); | |||
| ptr = (FLOAT *)result; | |||
| for (i = 0; i < nthreads; i++) { | |||
| sumf += (*ptr); | |||
| @@ -215,7 +215,7 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA | |||
| blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, | |||
| x, inc_x, y, inc_y, result, 0, | |||
| ( void *)zdot_thread_function, nthreads); | |||
| (int (*)(void))zdot_thread_function, nthreads); | |||
| ptr = (OPENBLAS_COMPLEX_FLOAT *)result; | |||
| for (i = 0; i < nthreads; i++) { | |||
| @@ -97,8 +97,6 @@ | |||
| *> \author Univ. of Colorado Denver | |||
| *> \author NAG Ltd. | |||
| * | |||
| *> \date December 2016 | |||
| * | |||
| *> \ingroup complexGEcomputational | |||
| * | |||
| *> \par Further Details: | |||
| @@ -127,10 +125,9 @@ | |||
| * ===================================================================== | |||
| SUBROUTINE CGEQRT2( M, N, A, LDA, T, LDT, INFO ) | |||
| * | |||
| * -- LAPACK computational routine (version 3.7.0) -- | |||
| * -- LAPACK computational routine -- | |||
| * -- LAPACK is a software package provided by Univ. of Tennessee, -- | |||
| * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- | |||
| * December 2016 | |||
| * | |||
| * .. Scalar Arguments .. | |||
| INTEGER INFO, LDA, LDT, M, N | |||
| @@ -157,10 +154,10 @@ | |||
| * Test the input arguments | |||
| * | |||
| INFO = 0 | |||
| IF( M.LT.0 ) THEN | |||
| INFO = -1 | |||
| ELSE IF( N.LT.0 ) THEN | |||
| IF( N.LT.0 ) THEN | |||
| INFO = -2 | |||
| ELSE IF( M.LT.N ) THEN | |||
| INFO = -1 | |||
| ELSE IF( LDA.LT.MAX( 1, M ) ) THEN | |||
| INFO = -4 | |||
| ELSE IF( LDT.LT.MAX( 1, N ) ) THEN | |||
| @@ -97,8 +97,6 @@ | |||
| *> \author Univ. of Colorado Denver | |||
| *> \author NAG Ltd. | |||
| * | |||
| *> \date December 2016 | |||
| * | |||
| *> \ingroup doubleGEcomputational | |||
| * | |||
| *> \par Further Details: | |||
| @@ -127,10 +125,9 @@ | |||
| * ===================================================================== | |||
| SUBROUTINE DGEQRT2( M, N, A, LDA, T, LDT, INFO ) | |||
| * | |||
| * -- LAPACK computational routine (version 3.7.0) -- | |||
| * -- LAPACK computational routine -- | |||
| * -- LAPACK is a software package provided by Univ. of Tennessee, -- | |||
| * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- | |||
| * December 2016 | |||
| * | |||
| * .. Scalar Arguments .. | |||
| INTEGER INFO, LDA, LDT, M, N | |||
| @@ -157,10 +154,10 @@ | |||
| * Test the input arguments | |||
| * | |||
| INFO = 0 | |||
| IF( M.LT.0 ) THEN | |||
| INFO = -1 | |||
| ELSE IF( N.LT.0 ) THEN | |||
| IF( N.LT.0 ) THEN | |||
| INFO = -2 | |||
| ELSE IF( M.LT.N ) THEN | |||
| INFO = -1 | |||
| ELSE IF( LDA.LT.MAX( 1, M ) ) THEN | |||
| INFO = -4 | |||
| ELSE IF( LDT.LT.MAX( 1, N ) ) THEN | |||
| @@ -97,8 +97,6 @@ | |||
| *> \author Univ. of Colorado Denver | |||
| *> \author NAG Ltd. | |||
| * | |||
| *> \date December 2016 | |||
| * | |||
| *> \ingroup realGEcomputational | |||
| * | |||
| *> \par Further Details: | |||
| @@ -127,10 +125,9 @@ | |||
| * ===================================================================== | |||
| SUBROUTINE SGEQRT2( M, N, A, LDA, T, LDT, INFO ) | |||
| * | |||
| * -- LAPACK computational routine (version 3.7.0) -- | |||
| * -- LAPACK computational routine -- | |||
| * -- LAPACK is a software package provided by Univ. of Tennessee, -- | |||
| * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- | |||
| * December 2016 | |||
| * | |||
| * .. Scalar Arguments .. | |||
| INTEGER INFO, LDA, LDT, M, N | |||
| @@ -157,10 +154,10 @@ | |||
| * Test the input arguments | |||
| * | |||
| INFO = 0 | |||
| IF( M.LT.0 ) THEN | |||
| INFO = -1 | |||
| ELSE IF( N.LT.0 ) THEN | |||
| IF( N.LT.0 ) THEN | |||
| INFO = -2 | |||
| ELSE IF( M.LT.N ) THEN | |||
| INFO = -1 | |||
| ELSE IF( LDA.LT.MAX( 1, M ) ) THEN | |||
| INFO = -4 | |||
| ELSE IF( LDT.LT.MAX( 1, N ) ) THEN | |||
| @@ -97,8 +97,6 @@ | |||
| *> \author Univ. of Colorado Denver | |||
| *> \author NAG Ltd. | |||
| * | |||
| *> \date December 2016 | |||
| * | |||
| *> \ingroup complex16GEcomputational | |||
| * | |||
| *> \par Further Details: | |||
| @@ -127,10 +125,9 @@ | |||
| * ===================================================================== | |||
| SUBROUTINE ZGEQRT2( M, N, A, LDA, T, LDT, INFO ) | |||
| * | |||
| * -- LAPACK computational routine (version 3.7.0) -- | |||
| * -- LAPACK computational routine -- | |||
| * -- LAPACK is a software package provided by Univ. of Tennessee, -- | |||
| * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- | |||
| * December 2016 | |||
| * | |||
| * .. Scalar Arguments .. | |||
| INTEGER INFO, LDA, LDT, M, N | |||
| @@ -157,10 +154,10 @@ | |||
| * Test the input arguments | |||
| * | |||
| INFO = 0 | |||
| IF( M.LT.0 ) THEN | |||
| INFO = -1 | |||
| ELSE IF( N.LT.0 ) THEN | |||
| IF( N.LT.0 ) THEN | |||
| INFO = -2 | |||
| ELSE IF( M.LT.N ) THEN | |||
| INFO = -1 | |||
| ELSE IF( LDA.LT.MAX( 1, M ) ) THEN | |||
| INFO = -4 | |||
| ELSE IF( LDT.LT.MAX( 1, N ) ) THEN | |||
| @@ -66,6 +66,7 @@ ZMATGEN = zlatms.o zlatme.o zlatmr.o zlatmt.o \ | |||
| endif | |||
| .PHONY: all | |||
| .NOTPARALLEL: | |||
| all: $(TMGLIB) | |||
| ALLOBJ = $(SMATGEN) $(CMATGEN) $(SCATGEN) $(DMATGEN) $(ZMATGEN) \ | |||
| @@ -662,7 +662,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, | |||
| blas_level1_thread(mode, bk, is + bk + offset + 1, mn + offset, (void *)dummyalpha, | |||
| a + (- offset + is * lda) * COMPSIZE, lda, NULL, 0, | |||
| ipiv, 1, (void *)LASWP_PLUS, args -> nthreads); | |||
| ipiv, 1, (int (*)(void))LASWP_PLUS, args -> nthreads); | |||
| is += bk; | |||
| } | |||
| @@ -57,10 +57,9 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG | |||
| a--; | |||
| k1 --; | |||
| #ifndef MINUS | |||
| ipiv += k1; | |||
| #else | |||
| ipiv -= (k2 - 1) * incx; | |||
| #ifdef MINUS | |||
| ipiv -= (k2 - k1 - 1) * incx; | |||
| #endif | |||
| if (n <= 0) return 0; | |||
| @@ -59,10 +59,9 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG | |||
| a--; | |||
| k1 --; | |||
| #ifndef MINUS | |||
| ipiv += k1; | |||
| #else | |||
| ipiv -= (k2 - 1) * incx; | |||
| #ifdef MINUS | |||
| ipiv -= (k2 - k1 - 1) * incx; | |||
| #endif | |||
| if (n <= 0) return 0; | |||
| @@ -65,10 +65,9 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG | |||
| a--; | |||
| k1 --; | |||
| #ifndef MINUS | |||
| ipiv += k1; | |||
| #else | |||
| ipiv -= (k2 - 1) * incx; | |||
| #ifdef MINUS | |||
| ipiv -= (k2 - k1 - 1) * incx; | |||
| #endif | |||
| if (n <= 0) return 0; | |||
| @@ -78,10 +78,9 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG | |||
| a--; | |||
| k1 --; | |||
| #ifndef MINUS | |||
| ipiv += k1; | |||
| #else | |||
| ipiv -= (k2 - 1) * incx; | |||
| #ifdef MINUS | |||
| ipiv -= (k2 - k1 - 1) * incx; | |||
| #endif | |||
| if (n <= 0) return 0; | |||
| @@ -59,10 +59,9 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, | |||
| lda *= 2; | |||
| k1 --; | |||
| #ifndef MINUS | |||
| ipiv += k1; | |||
| #else | |||
| ipiv -= (k2 - 1) * incx; | |||
| #ifdef MINUS | |||
| ipiv -= (k2 - k1 - 1) * incx; | |||
| #endif | |||
| if (n <= 0) return 0; | |||
| @@ -60,10 +60,9 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT dummy4, | |||
| lda *= 2; | |||
| k1 --; | |||
| #ifndef MINUS | |||
| ipiv += k1; | |||
| #else | |||
| ipiv -= (k2 - 1) * incx; | |||
| #ifdef MINUS | |||
| ipiv -= (k2 - k1 - 1) * incx; | |||
| #endif | |||
| if (n <= 0) return 0; | |||