| @@ -1,33 +1,38 @@ | |||
| # XXX: Precise is already deprecated, new default is Trusty. | |||
| # https://blog.travis-ci.com/2017-07-11-trusty-as-default-linux-is-coming | |||
| dist: precise | |||
| dist: focal | |||
| sudo: true | |||
| language: c | |||
| matrix: | |||
| include: | |||
| - &test-ubuntu | |||
| os: linux | |||
| # os: linux | |||
| compiler: gcc | |||
| addons: | |||
| apt: | |||
| packages: | |||
| - gfortran | |||
| # before_script: &common-before | |||
| # - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32" | |||
| # script: | |||
| # - make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE | |||
| # - make -C test $COMMON_FLAGS $BTYPE | |||
| # - make -C ctest $COMMON_FLAGS $BTYPE | |||
| # - make -C utest $COMMON_FLAGS $BTYPE | |||
| # env: | |||
| # - TARGET_BOX=LINUX64 | |||
| # - BTYPE="BINARY=64" | |||
| # | |||
| # - <<: *test-ubuntu | |||
| os: linux-ppc64le | |||
| before_script: &common-before | |||
| - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32" | |||
| - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=POWER8 NUM_THREADS=32" | |||
| script: | |||
| - make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE | |||
| - make -C test $COMMON_FLAGS $BTYPE | |||
| - make -C ctest $COMMON_FLAGS $BTYPE | |||
| - make -C utest $COMMON_FLAGS $BTYPE | |||
| env: | |||
| - TARGET_BOX=LINUX64 | |||
| - BTYPE="BINARY=64" | |||
| - <<: *test-ubuntu | |||
| os: linux-ppc64le | |||
| before_script: | |||
| - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=POWER8 NUM_THREADS=32" | |||
| env: | |||
| # for matrix annotation only | |||
| - TARGET_BOX=PPC64LE_LINUX | |||
| @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5) | |||
| project(OpenBLAS C ASM) | |||
| set(OpenBLAS_MAJOR_VERSION 0) | |||
| set(OpenBLAS_MINOR_VERSION 3) | |||
| set(OpenBLAS_PATCH_VERSION 17.dev) | |||
| set(OpenBLAS_PATCH_VERSION 18.dev) | |||
| set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") | |||
| # Adhere to GNU filesystem layout conventions | |||
| @@ -132,7 +132,7 @@ endif () | |||
| if (BUILD_BFLOAT16) | |||
| message(STATUS "Building Half Precision") | |||
| list(APPEND FLOAT_TYPES "BFLOAT16") # defines nothing | |||
| # list(APPEND FLOAT_TYPES "BFLOAT16") # defines nothing | |||
| endif () | |||
| if (NOT DEFINED CORE OR "${CORE}" STREQUAL "UNKNOWN") | |||
| @@ -1,4 +1,47 @@ | |||
| OpenBLAS ChangeLog | |||
| ==================================================================== | |||
| Version 0.3.18 | |||
| 02-Oct-2021 | |||
| general: | |||
| - when the build-time number of preconfigured threads is exceeded | |||
| at runtime (typically by an external program calling BLAS functions | |||
| from a larger number of threads in parallel), OpenBLAS will now | |||
| allocate an auxiliary control structure for up to 512 additional | |||
| threads instead of aborting | |||
| - added support for Loongson's LoongArch64 cpu architecture | |||
| - fixed building OpenBLAS with CMAKE and -DBUILD_BFLOAT16=ON | |||
| - added support for building OpenBLAS as a CMAKE subproject | |||
| - added support for building for Windows/ARM64 targets with clang | |||
| - improved support for building with the IBM xlf compiler | |||
| - imported Reference-LAPACK PR 625 (out-of-bounds reads in ?LARRV) | |||
| - imported Reference-LAPACK PR 597 for testsuite compatibility with | |||
| LLVM's libomp | |||
| x86_64: | |||
| - added SkylakeX S/DGEMM kernels for small problem sizes (M*N*K<=1000000) | |||
| - added optimized SBGEMM for Intel Cooper Lake | |||
| - reinstated the performance patch for AVX512 SGEMV_T with a proper fix | |||
| - added a workaround for a gcc11 tree-vectorizer bug that caused spurious | |||
| failures in the test programs for complex BLAS3 when compiling at -O3 | |||
| (the default for cmake "release" builds) | |||
| - added support for runtime cpu count detection under Haiku OS | |||
| - worked around a long-standing miscompilation issue of the Haswell DGEMV_T | |||
| kernel with gcc that could produce NaN output in some corner cases | |||
| POWER: | |||
| - improved performance of DASUM on POWER10 | |||
| ARMV8: | |||
| - fixed crashes (use of reserved register x18) on Apple M1 under OSX | |||
| - fixed building with gcc releases earlier than 5.1 | |||
| MIPS: | |||
| - fixed building under BSD | |||
| MIPS64: | |||
| - fixed building under BSD | |||
| ==================================================================== | |||
| Version 0.3.17 | |||
| 15-Jul-2021 | |||
| @@ -269,7 +269,7 @@ prof_lapack : lapack_prebuild | |||
| lapack_prebuild : | |||
| ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) | |||
| -@echo "FC = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc | |||
| -@echo "FFLAGS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc | |||
| -@echo "override FFLAGS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc | |||
| -@echo "FFLAGS_DRV = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc | |||
| -@echo "POPTS = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc | |||
| -@echo "FFLAGS_NOOPT = -O0 $(LAPACK_NOOPT)" >> $(NETLIB_LAPACK_DIR)/make.inc | |||
| @@ -12,9 +12,13 @@ endif | |||
| ifeq ($(CORE), POWER10) | |||
| ifneq ($(C_COMPILER), PGI) | |||
| CCOMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math | |||
| ifeq ($(F_COMPILER), IBM) | |||
| FCOMMON_OPT += -O2 -qrecur -qnosave | |||
| else | |||
| FCOMMON_OPT += -O2 -frecursive -mcpu=power10 -mtune=power10 -fno-fast-math | |||
| endif | |||
| endif | |||
| endif | |||
| ifeq ($(CORE), POWER9) | |||
| ifneq ($(C_COMPILER), PGI) | |||
| @@ -33,7 +37,11 @@ else | |||
| CCOMMON_OPT += -fast -Mvect=simd -Mcache_align | |||
| endif | |||
| ifneq ($(F_COMPILER), PGI) | |||
| ifeq ($(F_COMPILER), IBM) | |||
| FCOMMON_OPT += -O2 -qrecur -qnosave | |||
| else | |||
| FCOMMON_OPT += -O2 -frecursive -fno-fast-math | |||
| endif | |||
| ifeq ($(C_COMPILER), GCC) | |||
| ifneq ($(GCCVERSIONGT4), 1) | |||
| $(warning your compiler is too old to fully support POWER9, getting a newer version of gcc is recommended) | |||
| @@ -57,7 +65,11 @@ CCOMMON_OPT += -fast -Mvect=simd -Mcache_align | |||
| endif | |||
| ifneq ($(F_COMPILER), PGI) | |||
| ifeq ($(OSNAME), AIX) | |||
| ifeq ($(F_COMPILER), IBM) | |||
| FCOMMON_OPT += -O2 -qrecur -qnosave | |||
| else | |||
| FCOMMON_OPT += -O1 -frecursive -mcpu=power8 -mtune=power8 -fno-fast-math | |||
| endif | |||
| else | |||
| FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -fno-fast-math | |||
| endif | |||
| @@ -3,7 +3,7 @@ | |||
| # | |||
| # This library's version | |||
| VERSION = 0.3.17.dev | |||
| VERSION = 0.3.18.dev | |||
| # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a | |||
| # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library | |||
| @@ -16,6 +16,8 @@ else | |||
| HOSTARCH = $(ARCH) | |||
| endif | |||
| HAVE_GAS := $(shell as -v < /dev/null 2>&1 | grep GNU 2>&1 >/dev/null) | |||
| # Catch conflicting usage of ARCH in some BSD environments | |||
| ifeq ($(ARCH), amd64) | |||
| override ARCH=x86_64 | |||
| @@ -33,6 +35,10 @@ else ifeq ($(ARCH), armv7) | |||
| override ARCH=arm | |||
| else ifeq ($(ARCH), aarch64) | |||
| override ARCH=arm64 | |||
| else ifeq ($(ARCH), mipsel) | |||
| override ARCH=mips | |||
| else ifeq ($(ARCH), mips64el) | |||
| override ARCH=mips64 | |||
| else ifeq ($(ARCH), zarch) | |||
| override ARCH=zarch | |||
| endif | |||
| @@ -303,7 +309,7 @@ else | |||
| SMP = 1 | |||
| endif | |||
| else | |||
| ifeq ($(NUM_THREAD), 1) | |||
| ifeq ($(NUM_THREADS), 1) | |||
| SMP = | |||
| else | |||
| SMP = 1 | |||
| @@ -128,6 +128,7 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th | |||
| - **Intel Sandy Bridge**: Optimized Level-3 and Level-2 BLAS with AVX on x86-64. | |||
| - **Intel Haswell**: Optimized Level-3 and Level-2 BLAS with AVX2 and FMA on x86-64. | |||
| - **Intel Skylake-X**: Optimized Level-3 and Level-2 BLAS with AVX512 and FMA on x86-64. | |||
| - **Intel Cooper Lake**: as Skylake-X with improved BFLOAT16 support. | |||
| - **AMD Bobcat**: Used GotoBLAS2 Barcelona codes. | |||
| - **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thanks to Werner Saar) | |||
| - **AMD PILEDRIVER**: Uses Bulldozer codes with some optimizations. | |||
| @@ -153,6 +154,7 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th | |||
| - **ARMv8**: Basic ARMV8 with small caches, optimized Level-3 and Level-2 BLAS | |||
| - **Cortex-A53**: same as ARMV8 (different cpu specifications) | |||
| - **Cortex-A55**: same as ARMV8 (different cpu specifications) | |||
| - **Cortex A57**: Optimized Level-3 and Level-2 functions | |||
| - **Cortex A72**: same as A57 ( different cpu specifications) | |||
| - **Cortex A73**: same as A57 (different cpu specifications) | |||
| @@ -178,10 +180,11 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th | |||
| #### RISC-V | |||
| - **C910V**: Optimized Leve-3 BLAS (real) and Level-1,2 by RISC-V Vector extension 0.7.1. | |||
| - **C910V**: Optimized Level-3 BLAS (real) and Level-1,2 by RISC-V Vector extension 0.7.1. | |||
| ```sh | |||
| make HOSTCC=gcc TARGET=C910V CC=riscv64-unknown-linux-gnu-gcc FC=riscv64-unknown-linux-gnu-gfortran | |||
| ``` | |||
| (also known to work on C906) | |||
| ### Support for multiple targets in a single library | |||
| @@ -19,7 +19,7 @@ jobs: | |||
| # of gcc / glibc | |||
| - job: manylinux1_gcc | |||
| pool: | |||
| vmImage: 'ubuntu-16.04' | |||
| vmImage: 'ubuntu-latest' | |||
| steps: | |||
| - script: | | |||
| echo "FROM quay.io/pypa/manylinux1_x86_64 | |||
| @@ -35,7 +35,7 @@ jobs: | |||
| displayName: Run manylinux1 docker build | |||
| - job: Intel_SDE_skx | |||
| pool: | |||
| vmImage: 'ubuntu-16.04' | |||
| vmImage: 'ubuntu-latest' | |||
| steps: | |||
| - script: | | |||
| # at the time of writing the available Azure Ubuntu vm image | |||
| @@ -213,8 +213,9 @@ jobs: | |||
| vmImage: 'ubuntu-latest' | |||
| steps: | |||
| - script: | | |||
| wget 'https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.9.0/alpine-chroot-install' \ | |||
| && echo 'e5dfbbdc0c4b3363b99334510976c86bfa6cb251 alpine-chroot-install' | sha1sum -c || exit 1 | |||
| wget https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.13.1/alpine-chroot-install \ | |||
| && echo '7c7e3fa378e69aecc7f5f01bbc759e5f0a9d9b74 alpine-chroot-install' | sha1sum -c \ | |||
| || exit 1 | |||
| alpine() { /alpine/enter-chroot -u "$USER" "$@"; } | |||
| sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers sudo' | |||
| alpine make DYNAMIC_ARCH=1 BINARY=64 | |||
| @@ -104,7 +104,7 @@ endif () | |||
| if (${F_COMPILER} STREQUAL "IBM") | |||
| set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_IBM") | |||
| # FCOMMON_OPT += -qarch=440 | |||
| set(FCOMMON_OPT "${FCOMMON_OPT} -qrecur") | |||
| if (BINARY64) | |||
| set(FCOMMON_OPT "${FCOMMON_OPT} -q64") | |||
| if (INTERFACE64) | |||
| @@ -134,6 +134,8 @@ if (BUILD_BFLOAT16) | |||
| set(SHSWAPKERNEL ../arm/swap.c) | |||
| set(TOBF16KERNEL ../x86_64/tobf16.c) | |||
| set(BF16TOKERNEL ../x86_64/bf16to.c) | |||
| set(SBGEMVNKERNEL ../x86_64/sbgemv_n.c) | |||
| set(SBGEMVTKERNEL ../x86_64/sbgemv_t.c) | |||
| endif () | |||
| endmacro () | |||
| @@ -469,6 +469,9 @@ endif() | |||
| if (BUILD_COMPLEX16) | |||
| set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_COMPLEX16") | |||
| endif() | |||
| if (BUILD_BFLOAT16) | |||
| set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_BFLOAT16") | |||
| endif() | |||
| if(NOT MSVC) | |||
| set(CMAKE_ASM_FLAGS "${CMAKE_ASM_FLAGS} ${CCOMMON_OPT}") | |||
| endif() | |||
| @@ -26,10 +26,12 @@ | |||
| *****************************************************************************/ | |||
| #include <string.h> | |||
| #ifdef OS_DARWIN | |||
| #ifdef __APPLE__ | |||
| #include <sys/sysctl.h> | |||
| int32_t value; | |||
| size_t length=sizeof(value); | |||
| int64_t value64; | |||
| size_t length64=sizeof(value64); | |||
| #endif | |||
| #define CPU_UNKNOWN 0 | |||
| @@ -212,9 +214,9 @@ int detect(void) | |||
| } | |||
| #else | |||
| #ifdef DARWIN | |||
| #ifdef __APPLE__ | |||
| sysctlbyname("hw.cpufamily",&value,&length,NULL,0); | |||
| if (value ==131287967) return CPU_VORTEX; | |||
| if (value ==131287967|| value == 458787763 ) return CPU_VORTEX; | |||
| #endif | |||
| return CPU_ARMV8; | |||
| #endif | |||
| @@ -265,7 +267,7 @@ int n=0; | |||
| printf("#define NUM_CORES %d\n",n); | |||
| #endif | |||
| #ifdef DARWIN | |||
| #ifdef __APPLE__ | |||
| sysctlbyname("hw.physicalcpu_max",&value,&length,NULL,0); | |||
| printf("#define NUM_CORES %d\n",value); | |||
| #endif | |||
| @@ -420,17 +422,19 @@ void get_cpuconfig(void) | |||
| printf("#define DTB_DEFAULT_ENTRIES 64 \n"); | |||
| printf("#define DTB_SIZE 4096 \n"); | |||
| break; | |||
| #ifdef DARWIN | |||
| #ifdef __APPLE__ | |||
| case CPU_VORTEX: | |||
| printf("#define VORTEX \n"); | |||
| sysctlbyname("hw.l1icachesize",&value,&length,NULL,0); | |||
| printf("#define L1_CODE_SIZE %d \n",value); | |||
| sysctlbyname("hw.cachelinesize",&value,&length,NULL,0); | |||
| printf("#define L1_CODE_LINESIZE %d \n",value); | |||
| sysctlbyname("hw.l1dcachesize",&value,&length,NULL,0); | |||
| printf("#define L1_DATA_SIZE %d \n",value); | |||
| sysctlbyname("hw.l2dcachesize",&value,&length,NULL,0); | |||
| printf("#define L2_SIZE %d \n",value); | |||
| sysctlbyname("hw.l1icachesize",&value64,&length64,NULL,0); | |||
| printf("#define L1_CODE_SIZE %lld \n",value64); | |||
| sysctlbyname("hw.cachelinesize",&value64,&length64,NULL,0); | |||
| printf("#define L1_CODE_LINESIZE %lld \n",value64); | |||
| sysctlbyname("hw.l1dcachesize",&value64,&length64,NULL,0); | |||
| printf("#define L1_DATA_SIZE %lld \n",value64); | |||
| sysctlbyname("hw.l2cachesize",&value64,&length64,NULL,0); | |||
| printf("#define L2_SIZE %lld \n",value64); | |||
| printf("#define DTB_DEFAULT_ENTRIES 64 \n"); | |||
| printf("#define DTB_SIZE 4096 \n"); | |||
| break; | |||
| #endif | |||
| } | |||
| @@ -81,6 +81,7 @@ foreach (float_type ${FLOAT_TYPES}) | |||
| GenerateNamedObjects("gbmv_thread.c" "TRANSA" "gbmv_thread_t" false "" "" false ${float_type}) | |||
| endif () | |||
| # special defines for complex | |||
| if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX") | |||
| foreach (u_source ${U_SOURCES}) | |||
| @@ -197,6 +198,13 @@ foreach (float_type ${FLOAT_TYPES}) | |||
| endif () | |||
| endforeach () | |||
| if (BUILD_BFLOAT16) | |||
| if (USE_THREAD) | |||
| GenerateNamedObjects("sbgemv_thread.c" "" "gemv_thread_n" false "" "" false "BFLOAT16") | |||
| GenerateNamedObjects("sbgemv_thread.c" "TRANSA" "gemv_thread_t" false "" "" false "BFLOAT16") | |||
| endif () | |||
| endif () | |||
| if ( BUILD_COMPLEX AND NOT BUILD_SINGLE) | |||
| if (USE_THREAD) | |||
| GenerateNamedObjects("gemv_thread.c" "" "gemv_thread_n" false "" "" false "SINGLE") | |||
| @@ -12,6 +12,12 @@ foreach (GEMM_DEFINE ${GEMM_DEFINES}) | |||
| if (USE_THREAD AND NOT USE_SIMPLE_THREADED_LEVEL3) | |||
| GenerateNamedObjects("gemm.c" "${GEMM_DEFINE};THREADED_LEVEL3" "gemm_thread_${GEMM_DEFINE_LC}" 0) | |||
| endif () | |||
| if (BUILD_BFLOAT16) | |||
| GenerateNamedObjects("gemm.c" "${GEMM_DEFINE}" "gemm_${GEMM_DEFINE_LC}" 0 "" "" false "BFLOAT16") | |||
| if (USE_THREAD AND NOT USE_SIMPLE_THREADED_LEVEL3) | |||
| GenerateNamedObjects("gemm.c" "${GEMM_DEFINE};THREADED_LEVEL3" "gemm_thread_${GEMM_DEFINE_LC}" 0 "" "" false "BFLOAT16") | |||
| endif () | |||
| endif () | |||
| endforeach () | |||
| if ( BUILD_COMPLEX16 AND NOT BUILD_DOUBLE) | |||
| @@ -6,10 +6,6 @@ extern gotoblas_t gotoblas_POWER8; | |||
| #if (!defined __GNUC__) || ( __GNUC__ >= 6) | |||
| extern gotoblas_t gotoblas_POWER9; | |||
| #endif | |||
| //#if (!defined __GNUC__) || ( __GNUC__ >= 11) \ | |||
| // || (__GNUC__ == 10 && __GNUC_MINOR__ >= 2) | |||
| //#define HAVE_P10_SUPPORT 1 | |||
| //#endif | |||
| #ifdef HAVE_P10_SUPPORT | |||
| extern gotoblas_t gotoblas_POWER10; | |||
| #endif | |||
| @@ -2695,7 +2695,7 @@ static volatile struct { | |||
| } memory[NUM_BUFFERS]; | |||
| static volatile struct newmemstruct | |||
| struct newmemstruct | |||
| { | |||
| BLASULONG lock; | |||
| void *addr; | |||
| @@ -524,6 +524,9 @@ void blas_set_parameter(void){ | |||
| xgemm_p = ((xgemm_p + XGEMM_UNROLL_M - 1)/XGEMM_UNROLL_M) * XGEMM_UNROLL_M; | |||
| #endif | |||
| #ifdef BUILD_BFLOAT16 | |||
| sbgemm_r = (((BUFFER_SIZE - ((SBGEMM_P * SBGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SBGEMM_Q * 4)) - 15) & ~15; | |||
| #endif | |||
| sgemm_r = (((BUFFER_SIZE - ((SGEMM_P * SGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SGEMM_Q * 4)) - 15) & ~15; | |||
| dgemm_r = (((BUFFER_SIZE - ((DGEMM_P * DGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (DGEMM_Q * 8)) - 15) & ~15; | |||
| cgemm_r = (((BUFFER_SIZE - ((CGEMM_P * CGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (CGEMM_Q * 8)) - 15) & ~15; | |||
| @@ -629,7 +632,9 @@ void blas_set_parameter(void){ | |||
| xgemm_p = 16 * (size + 1); | |||
| #endif | |||
| #ifdef BUILD_BFLOAT16 | |||
| sbgemm_r = (((BUFFER_SIZE - ((SBGEMM_P * SBGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SBGEMM_Q * 4)) - 15) & ~15; | |||
| #endif | |||
| sgemm_r = (((BUFFER_SIZE - ((SGEMM_P * SGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SGEMM_Q * 4)) - 15) & ~15; | |||
| dgemm_r = (((BUFFER_SIZE - ((DGEMM_P * DGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (DGEMM_Q * 8)) - 15) & ~15; | |||
| cgemm_r = (((BUFFER_SIZE - ((CGEMM_P * CGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (CGEMM_Q * 8)) - 15) & ~15; | |||
| @@ -313,6 +313,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define FORCE | |||
| #define FORCE_INTEL | |||
| #define ARCHITECTURE "X86" | |||
| #ifdef NO_AVX | |||
| #define SUBARCHITECTURE "NEHALEM" | |||
| #define ARCHCONFIG "-DNEHALEM " \ | |||
| "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ | |||
| "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ | |||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ | |||
| "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2" | |||
| #define LIBNAME "nehalem" | |||
| #define CORENAME "NEHALEM" | |||
| #else | |||
| #define SUBARCHITECTURE "SANDYBRIDGE" | |||
| #define ARCHCONFIG "-DSANDYBRIDGE " \ | |||
| "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ | |||
| @@ -322,12 +332,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define LIBNAME "sandybridge" | |||
| #define CORENAME "SANDYBRIDGE" | |||
| #endif | |||
| #endif | |||
| #ifdef FORCE_HASWELL | |||
| #define FORCE | |||
| #define FORCE_INTEL | |||
| #define ARCHITECTURE "X86" | |||
| #ifdef NO_AVX2 | |||
| #ifdef NO_AVX | |||
| #define SUBARCHITECTURE "NEHALEM" | |||
| #define ARCHCONFIG "-DNEHALEM " \ | |||
| "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ | |||
| "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ | |||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ | |||
| "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2" | |||
| #define LIBNAME "nehalem" | |||
| #define CORENAME "NEHALEM" | |||
| #else | |||
| #define SUBARCHITECTURE "SANDYBRIDGE" | |||
| #define ARCHCONFIG "-DSANDYBRIDGE " \ | |||
| "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ | |||
| @@ -336,6 +357,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX" | |||
| #define LIBNAME "sandybridge" | |||
| #define CORENAME "SANDYBRIDGE" | |||
| #endif | |||
| #else | |||
| #define SUBARCHITECTURE "HASWELL" | |||
| #define ARCHCONFIG "-DHASWELL " \ | |||
| @@ -350,10 +372,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| #ifdef FORCE_SKYLAKEX | |||
| #ifdef NO_AVX512 | |||
| #define FORCE | |||
| #define FORCE_INTEL | |||
| #define ARCHITECTURE "X86" | |||
| #ifdef NO_AVX512 | |||
| #ifdef NO_AVX2 | |||
| #ifdef NO_AVX | |||
| #define SUBARCHITECTURE "NEHALEM" | |||
| #define ARCHCONFIG "-DNEHALEM " \ | |||
| "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ | |||
| "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ | |||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ | |||
| "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2" | |||
| #define LIBNAME "nehalem" | |||
| #define CORENAME "NEHALEM" | |||
| #else | |||
| #define SUBARCHITECTURE "SANDYBRIDGE" | |||
| #define ARCHCONFIG "-DSANDYBRIDGE " \ | |||
| "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ | |||
| "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ | |||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ | |||
| "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX" | |||
| #define LIBNAME "sandybridge" | |||
| #define CORENAME "SANDYBRIDGE" | |||
| #endif | |||
| #else | |||
| #define SUBARCHITECTURE "HASWELL" | |||
| #define ARCHCONFIG "-DHASWELL " \ | |||
| "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ | |||
| @@ -363,10 +406,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| "-DHAVE_AVX2 -DHAVE_FMA3 -DFMA3" | |||
| #define LIBNAME "haswell" | |||
| #define CORENAME "HASWELL" | |||
| #endif | |||
| #else | |||
| #define FORCE | |||
| #define FORCE_INTEL | |||
| #define ARCHITECTURE "X86" | |||
| #define SUBARCHITECTURE "SKYLAKEX" | |||
| #define ARCHCONFIG "-DSKYLAKEX " \ | |||
| "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ | |||
| @@ -380,10 +421,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| #ifdef FORCE_COOPERLAKE | |||
| #ifdef NO_AVX512 | |||
| #define FORCE | |||
| #define FORCE_INTEL | |||
| #define ARCHITECTURE "X86" | |||
| #ifdef NO_AVX512 | |||
| #ifdef NO_AVX2 | |||
| #ifdef NO_AVX | |||
| #define SUBARCHITECTURE "NEHALEM" | |||
| #define ARCHCONFIG "-DNEHALEM " \ | |||
| "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ | |||
| "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ | |||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ | |||
| "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2" | |||
| #define LIBNAME "nehalem" | |||
| #define CORENAME "NEHALEM" | |||
| #else | |||
| #define SUBARCHITECTURE "SANDYBRIDGE" | |||
| #define ARCHCONFIG "-DSANDYBRIDGE " \ | |||
| "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ | |||
| "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ | |||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ | |||
| "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX" | |||
| #define LIBNAME "sandybridge" | |||
| #define CORENAME "SANDYBRIDGE" | |||
| #endif | |||
| #else | |||
| #define SUBARCHITECTURE "HASWELL" | |||
| #define ARCHCONFIG "-DHASWELL " \ | |||
| "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ | |||
| @@ -393,10 +455,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| "-DHAVE_AVX2 -DHAVE_FMA3 -DFMA3" | |||
| #define LIBNAME "haswell" | |||
| #define CORENAME "HASWELL" | |||
| #endif | |||
| #else | |||
| #define FORCE | |||
| #define FORCE_INTEL | |||
| #define ARCHITECTURE "X86" | |||
| #define SUBARCHITECTURE "COOPERLAKE" | |||
| #define ARCHCONFIG "-DCOOPERLAKE " \ | |||
| "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ | |||
| @@ -564,6 +624,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define FORCE_INTEL | |||
| #define ARCHITECTURE "X86" | |||
| #ifdef NO_AVX2 | |||
| #ifdef NO_AVX | |||
| #define SUBARCHITECTURE "NEHALEM" | |||
| #define ARCHCONFIG "-DNEHALEM " \ | |||
| "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ | |||
| "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ | |||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ | |||
| "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2" | |||
| #define LIBNAME "nehalem" | |||
| #define CORENAME "NEHALEM" | |||
| #else | |||
| #define SUBARCHITECTURE "SANDYBRIDGE" | |||
| #define ARCHCONFIG "-DSANDYBRIDGE " \ | |||
| "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ | |||
| @@ -572,6 +642,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX" | |||
| #define LIBNAME "sandybridge" | |||
| #define CORENAME "SANDYBRIDGE" | |||
| #endif | |||
| #else | |||
| #define SUBARCHITECTURE "ZEN" | |||
| #define ARCHCONFIG "-DZEN " \ | |||
| @@ -82,6 +82,7 @@ foreach (CBLAS_FLAG ${CBLAS_FLAGS}) | |||
| GenerateNamedObjects("${BLAS3_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false ${DISABLE_COMPLEX}) | |||
| GenerateNamedObjects("${BLAS3_MANGLED_SOURCES}" "" "" ${CBLAS_FLAG} "" "" false ${MANGLE_COMPLEX}) | |||
| GenerateNamedObjects("xerbla.c" "" "xerbla" ${CBLAS_FLAG} "" "" true) | |||
| #sdsdot, dsdot | |||
| if (BUILD_SINGLE OR BUILD_DOUBLE) | |||
| GenerateNamedObjects("sdsdot.c" "" "sdsdot" ${CBLAS_FLAG} "" "" true "SINGLE") | |||
| @@ -104,6 +105,15 @@ endif () | |||
| GenerateNamedObjects("imax.c" "USE_ABS;USE_MIN" "i*amin" ${CBLAS_FLAG}) | |||
| GenerateNamedObjects("imax.c" "USE_MIN" "i*min" ${CBLAS_FLAG}) | |||
| if (BUILD_BFLOAT16) | |||
| GenerateNamedObjects("bf16dot.c" "" "sbdot" ${CBLAS_FLAG} "" "" true "BFLOAT16") | |||
| GenerateNamedObjects("gemm.c" "" "sbgemm" ${CBLAS_FLAG} "" "" true "BFLOAT16") | |||
| GenerateNamedObjects("sbgemv.c" "" "sbgemv" ${CBLAS_FLAG} "" "" true "BFLOAT16") | |||
| GenerateNamedObjects("tobf16.c" "SINGLE_PREC" "sbstobf16" ${CBLAS_FLAG} "" "" true "BFLOAT16") | |||
| GenerateNamedObjects("tobf16.c" "DOUBLE_PREC" "sbdtobf16" ${CBLAS_FLAG} "" "" true "BFLOAT16") | |||
| GenerateNamedObjects("bf16to.c" "SINGLE_PREC" "sbf16tos" ${CBLAS_FLAG} "" "" true "BFLOAT16") | |||
| GenerateNamedObjects("bf16to.c" "DOUBLE_PREC" "dbf16tod" ${CBLAS_FLAG} "" "" true "BFLOAT16") | |||
| endif () | |||
| # complex-specific sources | |||
| foreach (float_type ${FLOAT_TYPES}) | |||
| @@ -326,7 +326,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS | |||
| PRINT_DEBUG_CNAME; | |||
| #if !defined(COMPLEX) && !defined(DOUBLE) && defined(USE_SGEMM_KERNEL_DIRECT) | |||
| #if !defined(COMPLEX) && !defined(DOUBLE) && !defined(BFLOAT16) && defined(USE_SGEMM_KERNEL_DIRECT) | |||
| #ifdef DYNAMIC_ARCH | |||
| if (support_avx512() ) | |||
| #endif | |||
| @@ -119,7 +119,7 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, | |||
| void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, int n, FLOAT alpha, FLOAT *x, int incx, FLOAT *a, int lda) { | |||
| FLOAT *buffer; | |||
| int trans, uplo; | |||
| int uplo; | |||
| blasint info; | |||
| FLOAT * ALPHA = α | |||
| FLOAT alpha_r = ALPHA[0]; | |||
| @@ -130,7 +130,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, int n, FLOAT alpha, FLO | |||
| PRINT_DEBUG_CNAME; | |||
| trans = -1; | |||
| uplo = -1; | |||
| info = 0; | |||
| @@ -91,6 +91,15 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) | |||
| GenerateNamedObjects("${KERNELDIR}/${DSDOTKERNEL}" "DSDOT" "d*dot_k" false "" "" false "SINGLE") | |||
| GenerateNamedObjects("${KERNELDIR}/${DSDOTKERNEL}" "DSDOT" "dsdot_k" false "" "" false "SINGLE") | |||
| # sbdot | |||
| if (BUILD_BFLOAT16) | |||
| GenerateNamedObjects("${KERNELDIR}/${SBDOTKERNEL}" "SBDOT" "dot_k" false "" "" false "BFLOAT16") | |||
| GenerateNamedObjects("${KERNELDIR}/${BF16TOKERNEL}" "SINGLE" "f16tos_k" false "" "" false "BFLOAT16") | |||
| GenerateNamedObjects("${KERNELDIR}/${BF16TOKERNEL}" "DOUBLE" "bf16tod_k" false "" "" false "DOUBLE") | |||
| GenerateNamedObjects("${KERNELDIR}/${TOBF16KERNEL}" "SINGLE" "stobf16_k" false "" "" false "BFLOAT16") | |||
| GenerateNamedObjects("${KERNELDIR}/${TOBF16KERNEL}" "DOUBLE" "dtobf16_k" false "" "" false "BFLOAT16") | |||
| endif() | |||
| if ((BUILD_COMPLEX OR BUILD_DOUBLE) AND NOT BUILD_SINGLE) | |||
| GenerateNamedObjects("${KERNELDIR}/${SAMAXKERNEL}" "USE_ABS" "amax_k" false "" "" false "SINGLE") | |||
| GenerateNamedObjects("${KERNELDIR}/${SAMINKERNEL}" "USE_ABS;USE_MIN" "amin_k" false "" "" false "SINGLE") | |||
| @@ -149,9 +158,6 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) | |||
| GenerateNamedObjects("generic/ger.c" "" "ger_k" false "" "" "" 3) | |||
| foreach (float_type ${FLOAT_TYPES}) | |||
| string(SUBSTRING ${float_type} 0 1 float_char) | |||
| if (${float_type} STREQUAL "BFLOAT16") | |||
| set (float_char "SB") | |||
| endif () | |||
| if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX") | |||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}GERUKERNEL}" "" "geru_k" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}GERCKERNEL}" "CONJ" "gerc_k" false "" "" false ${float_type}) | |||
| @@ -185,6 +191,10 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) | |||
| GenerateNamedObjects("${KERNELDIR}/${SGEMVNKERNEL}" "" "gemv_n" false "" "" false "SINGLE") | |||
| GenerateNamedObjects("${KERNELDIR}/${SGEMVTKERNEL}" "TRANS" "gemv_t" false "" "" false "SINGLE") | |||
| endif () | |||
| if (BUILD_BFLOAT16) | |||
| GenerateNamedObjects("${KERNELDIR}/${SBGEMVNKERNEL}" "" "gemv_n" false "" "" false "BFLOAT16") | |||
| GenerateNamedObjects("${KERNELDIR}/${SBGEMVTKERNEL}" "" "gemv_t" false "" "" false "BFLOAT16") | |||
| endif () | |||
| # Makefile.L3 | |||
| set(USE_TRMM false) | |||
| string(TOUPPER ${TARGET_CORE} UC_TARGET_CORE) | |||
| @@ -209,15 +219,8 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) | |||
| GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTPERFORMANT}" "" "gemm_direct_performant" false "" "" false SINGLE) | |||
| endif() | |||
| foreach (float_type SINGLE DOUBLE BFLOAT16) | |||
| foreach (float_type SINGLE DOUBLE) | |||
| string(SUBSTRING ${float_type} 0 1 float_char) | |||
| if (${float_type} STREQUAL "BFLOAT16") | |||
| if (NOT ${BUILD_BFLOAT16}) | |||
| continue () | |||
| else () | |||
| set (float_char "SB") | |||
| endif () | |||
| endif () | |||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMKERNEL}" "" "gemm_kernel" false "" "" false ${float_type}) | |||
| endforeach() | |||
| if (BUILD_COMPLEX16 AND NOT BUILD_DOUBLE) | |||
| @@ -253,11 +256,24 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) | |||
| GenerateNamedObjects("${KERNELDIR}/${SGEMM_BETA}" "" "gemm_beta" false "" "" false "SINGLE") | |||
| endif () | |||
| if (BUILD_BFLOAT16) | |||
| if (SBGEMMINCOPY) | |||
| GenerateNamedObjects("${KERNELDIR}/${SBGEMMINCOPY}" "" "${SBGEMMINCOPYOBJ}" false "" "" true "BFLOAT16") | |||
| endif () | |||
| if (SBGEMMITCOPY) | |||
| GenerateNamedObjects("${KERNELDIR}/${SBGEMMITCOPY}" "" "${SBGEMMITCOPYOBJ}" false "" "" true "BFLOAT16") | |||
| endif () | |||
| if (SBGEMMONCOPY) | |||
| GenerateNamedObjects("${KERNELDIR}/${SBGEMMONCOPY}" "" "${SBGEMMONCOPYOBJ}" false "" "" true "BFLOAT16") | |||
| endif () | |||
| if (SBGEMMOTCOPY) | |||
| GenerateNamedObjects("${KERNELDIR}/${SBGEMMOTCOPY}" "" "${SBGEMMOTCOPYOBJ}" false "" "" true "BFLOAT16") | |||
| endif () | |||
| GenerateNamedObjects("${KERNELDIR}/${SBGEMMKERNEL}" "" "gemm_kernel" false "" "" false "BFLOAT16") | |||
| GenerateNamedObjects("${KERNELDIR}/${SBGEMM_BETA}" "" "gemm_beta" false "" "" false "BFLOAT16") | |||
| endif () | |||
| foreach (float_type ${FLOAT_TYPES}) | |||
| string(SUBSTRING ${float_type} 0 1 float_char) | |||
| if (${float_type} STREQUAL "BFLOAT16") | |||
| set (float_char "SB") | |||
| endif () | |||
| if (${float_char}GEMMINCOPY) | |||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMINCOPY}" "${float_type}" "${${float_char}GEMMINCOPYOBJ}" false "" "" true ${float_type}) | |||
| endif () | |||
| @@ -568,6 +584,44 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) | |||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_TN}" "B0" "gemm_small_kernel_b0_tn" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMM_SMALL_K_B0_NT}" "B0" "gemm_small_kernel_b0_tt" false "" "" false ${float_type}) | |||
| endif () | |||
| if (BUILD_BFLOAT16) | |||
| if (NOT DEFINED SBGEMM_SMALL_M_PERMIT) | |||
| set(SBGEMM_SMALL_M_PERMIT ../generic/gemm_small_matrix_permit.c) | |||
| endif () | |||
| if (NOT DEFINED SBGEMM_SMALL_K_NN) | |||
| set(SBGEMM_SMALL_K_NN ../generic/gemm_small_matrix_kernel_nn.c) | |||
| endif () | |||
| if (NOT DEFINED SBGEMM_SMALL_K_NT) | |||
| set(SBGEMM_SMALL_K_NT ../generic/gemm_small_matrix_kernel_nt.c) | |||
| endif () | |||
| if (NOT DEFINED SBGEMM_SMALL_K_TN) | |||
| set(SBGEMM_SMALL_K_TN ../generic/gemm_small_matrix_kernel_tn.c) | |||
| endif () | |||
| if (NOT DEFINED SBGEMM_SMALL_K_TT) | |||
| set(SBGEMM_SMALL_K_TT ../generic/gemm_small_matrix_kernel_tt.c) | |||
| endif () | |||
| if (NOT DEFINED SBGEMM_SMALL_K_B0_NN) | |||
| set(SBGEMM_SMALL_K_B0_NN ../generic/gemm_small_matrix_kernel_nn.c) | |||
| endif () | |||
| if (NOT DEFINED SBGEMM_SMALL_K_B0_NT) | |||
| set(SBGEMM_SMALL_K_B0_NT ../generic/gemm_small_matrix_kernel_nt.c) | |||
| endif () | |||
| if (NOT DEFINED SBGEMM_SMALL_K_B0_TN) | |||
| set(SBGEMM_SMALL_K_B0_TN ../generic/gemm_small_matrix_kernel_tn.c) | |||
| endif () | |||
| if (NOT DEFINED SBGEMM_SMALL_K_B0_TT) | |||
| set($SBGEMM_SMALL_K_B0_TT ../generic/gemm_small_matrix_kernel_tt.c) | |||
| endif () | |||
| GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_M_PERMIT}" "" "gemm_small_matrix_permit" false "" "" false "BFLOAT16") | |||
| GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_NN}" "" "gemm_small_kernel_nn" false "" "" false "BFLOAT16") | |||
| GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_NT}" "" "gemm_small_kernel_nt" false "" "" false "BFLOAT16") | |||
| GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_TN}" "" "gemm_small_kernel_tn" false "" "" false "BFLOAT16") | |||
| GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_NT}" "" "gemm_small_kernel_tt" false "" "" false "BFLOAT16") | |||
| GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_B0_NN}" "B0" "gemm_small_kernel_b0_nn" false "" "" false "BFLOAT16") | |||
| GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_B0_NT}" "B0" "gemm_small_kernel_b0_nt" false "" "" false "BFLOAT16") | |||
| GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_B0_TN}" "B0" "gemm_small_kernel_b0_tn" false "" "" false "BFLOAT16") | |||
| GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_B0_NT}" "B0" "gemm_small_kernel_b0_tt" false "" "" false "BFLOAT16") | |||
| endif () | |||
| endif () | |||
| if (NOT DEFINED ${float_char}OMATCOPY_CN) | |||
| @@ -702,6 +756,7 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) | |||
| #geadd | |||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}GEADD_KERNEL}" "" "geadd_k" false "" "" false ${float_type}) | |||
| endforeach () | |||
| if (BUILD_DOUBLE AND NOT BUILD_SINGLE) | |||
| GenerateNamedObjects("${KERNELDIR}/${STRSMKERNEL_LN}" "UPPER;LN;TRSMKERNEL" "trsm_kernel_LN" false "" "" false "SINGLE") | |||
| GenerateNamedObjects("${KERNELDIR}/${STRSMKERNEL_LT}" "LT;TRSMKERNEL" "trsm_kernel_LT" false "" "" false "SINGLE") | |||
| @@ -840,22 +895,22 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) | |||
| GenerateNamedObjects("generic/trsm_ltcopy_${SGEMM_UNROLL_N}.c" "OUTER;LOWER" "trsm_oltncopy" false "" ${TSUFFIX} false "SINGLE") | |||
| if (SGEMMINCOPY) | |||
| GenerateNamedObjects("${KERNELDIR}/${SGEMMINCOPY}" "SINGLE" "${SGEMMINCOPYOBJ}" false "" "" true "SINGLE") | |||
| GenerateNamedObjects("${KERNELDIR}/${SGEMMINCOPY}" "SINGLE" "${SGEMMINCOPYOBJ}" false "" "" true "SINGLE") | |||
| endif () | |||
| if (SGEMMITCOPY) | |||
| GenerateNamedObjects("${KERNELDIR}/${SGEMMITCOPY}" "SINGLE" "${SGEMMITCOPYOBJ}" false "" "" true "SINGLE") | |||
| endif () | |||
| if (SGEMMONCOPY) | |||
| GenerateNamedObjects("${KERNELDIR}/${SGEMMONCOPY}" "SINGLE" "${SGEMMONCOPYOBJ}" false "" "" true "SINGLE") | |||
| endif () | |||
| if (SGEMMOTCOPY) | |||
| GenerateNamedObjects("${KERNELDIR}/${SGEMMOTCOPY}" "SINGLE" "${SGEMMOTCOPYOBJ}" false "" "" true "SINGLE") | |||
| if (SGEMMITCOPY) | |||
| GenerateNamedObjects("${KERNELDIR}/${SGEMMITCOPY}" "SINGLE" "${SGEMMITCOPYOBJ}" false "" "" true "SINGLE") | |||
| endif () | |||
| if (SGEMMONCOPY) | |||
| GenerateNamedObjects("${KERNELDIR}/${SGEMMONCOPY}" "SINGLE" "${SGEMMONCOPYOBJ}" false "" "" true "SINGLE") | |||
| endif () | |||
| if (SGEMMOTCOPY) | |||
| GenerateNamedObjects("${KERNELDIR}/${SGEMMOTCOPY}" "SINGLE" "${SGEMMOTCOPYOBJ}" false "" "" true "SINGLE") | |||
| endif () | |||
| GenerateNamedObjects("${KERNELDIR}/${SGEMVNKERNEL}" "" "gemv_n" false "" "" false "SINGLE") | |||
| GenerateNamedObjects("${KERNELDIR}/${SGEMVTKERNEL}" "TRANS" "gemv_t" false "" "" false "SINGLE") | |||
| endif () | |||
| if (BUILD_COMPLEX16 AND NOT BUILD_DOUBLE) | |||
| if (BUILD_COMPLEX16 AND NOT BUILD_DOUBLE) | |||
| GenerateNamedObjects("generic/neg_tcopy_${DGEMM_UNROLL_M}.c" "" "neg_tcopy" false "" ${TSUFFIX} false "DOUBLE") | |||
| GenerateNamedObjects("generic/laswp_ncopy_${DGEMM_UNROLL_N}.c" "" "laswp_ncopy" false "" ${TSUFFIX} false "DOUBLE") | |||
| endif () | |||
| @@ -50,11 +50,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define B03 x16 | |||
| #define B04 x17 | |||
| #define I x18 | |||
| #define J x19 | |||
| #define I x19 | |||
| #define J x20 | |||
| #define TEMP1 x20 | |||
| #define TEMP2 x21 | |||
| #define TEMP1 x21 | |||
| #define A_PREFETCH 2560 | |||
| #define B_PREFETCH 256 | |||
| @@ -49,9 +49,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define pCRow3 x15 | |||
| #define pA x16 | |||
| #define alpha x17 | |||
| #define temp x18 | |||
| //#define temp x18 | |||
| #define tempOffset x19 | |||
| #define tempK x20 | |||
| #define temp x21 | |||
| #define alpha0 d10 | |||
| #define alphaV0 v10.d[0] | |||
| @@ -30,7 +30,7 @@ All rights reserved. | |||
| #define B00 x22 | |||
| #define I x18 | |||
| #define I x21 | |||
| #define J x19 | |||
| #define TEMP1 x20 | |||
| @@ -49,9 +49,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define pCRow3 x15 | |||
| #define pA x16 | |||
| #define alpha w17 | |||
| #define temp x18 | |||
| //#define temp x18 | |||
| #define tempOffset x19 | |||
| #define tempK x20 | |||
| #define temp x21 | |||
| #define alpha0 s10 | |||
| #define alphaV0 v10.s[0] | |||
| @@ -48,8 +48,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define pCRow2 x14 | |||
| #define pCRow3 x15 | |||
| #define pA x16 | |||
| #define alphaR x17 | |||
| #define alphaI x18 | |||
| #define alphaR x19 | |||
| #define alphaI x20 | |||
| #define alpha0_R d10 | |||
| #define alphaV0_R v10.d[0] | |||
| @@ -49,7 +49,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define pCRow3 x15 | |||
| #define pA x16 | |||
| #define alphaR x17 | |||
| #define alphaI x18 | |||
| #define alphaI x22 | |||
| #define temp x19 | |||
| #define tempOffset x20 | |||
| #define tempK x21 | |||
| @@ -47,7 +47,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| if ( (inc_x == 1) && (inc_y == 1) ) | |||
| { | |||
| int n1 = n & -4; | |||
| #if V_SIMD && !defined(DSDOT) | |||
| const int vstep = v_nlanes_f32; | |||
| const int unrollx4 = n & (-vstep * 4); | |||
| @@ -84,6 +83,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| } | |||
| dot = v_sum_f32(vsum0); | |||
| #elif defined(DSDOT) | |||
| int n1 = n & -4; | |||
| for (; i < n1; i += 4) | |||
| { | |||
| dot += (double) y[i] * (double) x[i] | |||
| @@ -92,6 +92,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| + (double) y[i+3] * (double) x[i+3] ; | |||
| } | |||
| #else | |||
| int n1 = n & -4; | |||
| for (; i < n1; i += 4) | |||
| { | |||
| dot += y[i] * x[i] | |||
| @@ -1,7 +1,6 @@ | |||
| ifeq ($(__BYTE_ORDER__),__ORDER_BIG_ENDIAN__) | |||
| ifeq ($(HAVE_GAS), 1) | |||
| include $(KERNELDIR)/KERNEL.POWER8 | |||
| else | |||
| #SGEMM_BETA = ../generic/gemm_beta.c | |||
| #DGEMM_BETA = ../generic/gemm_beta.c | |||
| #CGEMM_BETA = ../generic/zgemm_beta.c | |||
| @@ -44,6 +43,7 @@ DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMKERNEL = cgemm_kernel_power10.S | |||
| #CGEMMKERNEL = cgemm_kernel_8x4_power8.S | |||
| CGEMMINCOPY = ../generic/zgemm_ncopy_8.c | |||
| CGEMMITCOPY = ../generic/zgemm_tcopy_8.c | |||
| CGEMMONCOPY = ../generic/zgemm_ncopy_4.c | |||
| @@ -218,5 +218,4 @@ QCABS_KERNEL = ../generic/cabs.c | |||
| #Dump kernel | |||
| CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c | |||
| ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c | |||
| endif | |||
| @@ -36,9 +36,12 @@ static void caxpy_kernel_8 (long n, float *x, float *y, | |||
| #endif | |||
| const float *mvecp = mvec; | |||
| /* We have to load reverse mask for big endian. */ | |||
| /* __vector unsigned char mask={ 4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11}; */ | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| __vector unsigned char mask={ 4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11}; | |||
| #else | |||
| __vector unsigned char mask = { 11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4}; | |||
| #endif | |||
| long ytmp; | |||
| __asm__ | |||
| @@ -112,6 +115,16 @@ static void caxpy_kernel_8 (long n, float *x, float *y, | |||
| "xvmaddasp 38, 58, 33 \n\t" | |||
| "xvmaddasp 39, 59, 33 \n\t" | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| "stxv 48, 0(%4) \n\t" | |||
| "stxv 49, 16(%4) \n\t" | |||
| "stxv 50, 32(%4) \n\t" | |||
| "stxv 51, 48(%4) \n\t" | |||
| "stxv 34, 64(%4) \n\t" | |||
| "stxv 35, 80(%4) \n\t" | |||
| "stxv 38, 96(%4) \n\t" | |||
| "stxv 39, 112(%4) \n\t" | |||
| #else | |||
| "stxv 49, 0(%4) \n\t" | |||
| "stxv 48, 16(%4) \n\t" | |||
| "stxv 51, 32(%4) \n\t" | |||
| @@ -120,6 +133,7 @@ static void caxpy_kernel_8 (long n, float *x, float *y, | |||
| "stxv 34, 80(%4) \n\t" | |||
| "stxv 39, 96(%4) \n\t" | |||
| "stxv 38, 112(%4) \n\t" | |||
| #endif | |||
| "addi %4, %4, 128 \n\t" | |||
| "xxperm 52, 40, %x10 \n\t" // exchange real and imag part | |||
| @@ -163,6 +177,16 @@ static void caxpy_kernel_8 (long n, float *x, float *y, | |||
| "xvmaddasp 38, 58, 33 \n\t" | |||
| "xvmaddasp 39, 59, 33 \n\t" | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| "stxv 48, 0(%4) \n\t" | |||
| "stxv 49, 16(%4) \n\t" | |||
| "stxv 50, 32(%4) \n\t" | |||
| "stxv 51, 48(%4) \n\t" | |||
| "stxv 34, 64(%4) \n\t" | |||
| "stxv 35, 80(%4) \n\t" | |||
| "stxv 38, 96(%4) \n\t" | |||
| "stxv 39, 112(%4) \n\t" | |||
| #else | |||
| "stxv 49, 0(%4) \n\t" | |||
| "stxv 48, 16(%4) \n\t" | |||
| "stxv 51, 32(%4) \n\t" | |||
| @@ -171,6 +195,7 @@ static void caxpy_kernel_8 (long n, float *x, float *y, | |||
| "stxv 34, 80(%4) \n\t" | |||
| "stxv 39, 96(%4) \n\t" | |||
| "stxv 38, 112(%4) \n\t" | |||
| #endif | |||
| "#n=%1 x=%5=%2 y=%0=%3 alpha=(%7,%8) mvecp=%6=%9 ytmp=%4\n" | |||
| : | |||
| @@ -46,7 +46,16 @@ static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y) | |||
| ".align 5 \n" | |||
| "one%=: \n\t" | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| "stxv 32, 0(%3) \n\t" | |||
| "stxv 33, 16(%3) \n\t" | |||
| "stxv 34, 32(%3) \n\t" | |||
| "stxv 35, 48(%3) \n\t" | |||
| "stxv 36, 64(%3) \n\t" | |||
| "stxv 37, 80(%3) \n\t" | |||
| "stxv 38, 96(%3) \n\t" | |||
| "stxv 39, 112(%3) \n\t" | |||
| #else | |||
| "stxv 33, 0(%3) \n\t" | |||
| "stxv 32, 16(%3) \n\t" | |||
| "stxv 35, 32(%3) \n\t" | |||
| @@ -55,11 +64,21 @@ static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y) | |||
| "stxv 36, 80(%3) \n\t" | |||
| "stxv 39, 96(%3) \n\t" | |||
| "stxv 38, 112(%3) \n\t" | |||
| #endif | |||
| "lxvp 32, 0(%2) \n\t" | |||
| "lxvp 34, 32(%2) \n\t" | |||
| "lxvp 36, 64(%2) \n\t" | |||
| "lxvp 38, 96(%2) \n\t" | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| "stxv 40, 128(%3) \n\t" | |||
| "stxv 41, 144(%3) \n\t" | |||
| "stxv 42, 160(%3) \n\t" | |||
| "stxv 43, 176(%3) \n\t" | |||
| "stxv 44, 192(%3) \n\t" | |||
| "stxv 45, 208(%3) \n\t" | |||
| "stxv 46, 224(%3) \n\t" | |||
| "stxv 47, 240(%3) \n\t" | |||
| #else | |||
| "stxv 41, 128(%3) \n\t" | |||
| "stxv 40, 144(%3) \n\t" | |||
| "stxv 43, 160(%3) \n\t" | |||
| @@ -68,6 +87,7 @@ static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y) | |||
| "stxv 44, 208(%3) \n\t" | |||
| "stxv 47, 224(%3) \n\t" | |||
| "stxv 46, 240(%3) \n\t" | |||
| #endif | |||
| "lxvp 40, 128(%2) \n\t" | |||
| "lxvp 42, 160(%2) \n\t" | |||
| "lxvp 44, 192(%2) \n\t" | |||
| @@ -81,7 +101,24 @@ static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y) | |||
| "bgt one%= \n" | |||
| "two%=: \n\t" | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| "stxv 32, 0(%3) \n\t" | |||
| "stxv 33, 16(%3) \n\t" | |||
| "stxv 34, 32(%3) \n\t" | |||
| "stxv 35, 48(%3) \n\t" | |||
| "stxv 36, 64(%3) \n\t" | |||
| "stxv 37, 80(%3) \n\t" | |||
| "stxv 38, 96(%3) \n\t" | |||
| "stxv 39, 112(%3) \n\t" | |||
| "stxv 40, 128(%3) \n\t" | |||
| "stxv 41, 144(%3) \n\t" | |||
| "stxv 42, 160(%3) \n\t" | |||
| "stxv 43, 176(%3) \n\t" | |||
| "stxv 44, 192(%3) \n\t" | |||
| "stxv 45, 208(%3) \n\t" | |||
| "stxv 46, 224(%3) \n\t" | |||
| "stxv 47, 240(%3) \n\t" | |||
| #else | |||
| "stxv 33, 0(%3) \n\t" | |||
| "stxv 32, 16(%3) \n\t" | |||
| "stxv 35, 32(%3) \n\t" | |||
| @@ -98,7 +135,7 @@ static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y) | |||
| "stxv 44, 208(%3) \n\t" | |||
| "stxv 47, 224(%3) \n\t" | |||
| "stxv 46, 240(%3) \n\t" | |||
| #endif | |||
| "#n=%1 x=%4=%2 y=%0=%3" | |||
| : | |||
| "=m" (*y), | |||
| @@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #else | |||
| #include "common.h" | |||
| #if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) | |||
| #if defined(POWER10) | |||
| #include "cdot_microk_power10.c" | |||
| #else | |||
| #ifndef HAVE_KERNEL_8 | |||
| @@ -120,7 +120,7 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA | |||
| if ((inc_x == 1) && (inc_y == 1)) { | |||
| #if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) | |||
| #if defined(POWER10) | |||
| BLASLONG n1 = n & -16; | |||
| #else | |||
| BLASLONG n1 = n & -8; | |||
| @@ -29,7 +29,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| static void cdot_kernel_8 (long n, float *x, float *y, float *dot) | |||
| { | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| __vector unsigned char mask = {4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11}; | |||
| #else | |||
| __vector unsigned char mask = { 11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4}; | |||
| #endif | |||
| __asm__ | |||
| ( | |||
| "dcbt 0, %2 \n\t" | |||
| @@ -153,7 +157,11 @@ static void cdot_kernel_8 (long n, float *x, float *y, float *dot) | |||
| "xxswapd 33, 34 \n\t" | |||
| "xvaddsp 35, 35, 32 \n\t" | |||
| "xvaddsp 34, 34, 33 \n\t" | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| "xxpermdi 34, 35, 34, 0 \n\t" | |||
| #else | |||
| "xxpermdi 34, 34, 35, 2 \n\t" | |||
| #endif | |||
| "stxv 34, 0(%6) \n\t" | |||
| "#n=%1 x=%4=%2 y=%5=%3 dot=%0=%6" | |||
| @@ -76,11 +76,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "cgemm_macros_power10.S" | |||
| #if (_AIX) | |||
| .set perm_const1, 0x0405060700010203 | |||
| .set perm_const2, 0x0c0d0e0f08090a0b | |||
| .set save_permute_12, 0x1011121300010203 | |||
| .set save_permute_11, 0x18191a1b08090a0b | |||
| #else | |||
| .equ perm_const1, 0x0405060700010203 | |||
| .equ perm_const2, 0x0c0d0e0f08090a0b | |||
| .equ save_permute_12, 0x0c0d0e0f1c1d1e1f | |||
| .equ save_permute_11, 0x0405060714151617 | |||
| #endif | |||
| #ifndef NEEDPARAM | |||
| @@ -172,24 +178,44 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| /*load reverse permute mask for big endian | |||
| uint128 = 0xc0d0e0f08090a0b0405060700010203 | |||
| */ | |||
| #if (_AIX) | |||
| lis T2, (perm_const2>>48 & 0xFFFF) | |||
| lis T1, (perm_const1>>48 & 0xFFFF) | |||
| lis T3, (save_permute_12>>48 & 0xFFFF) | |||
| lis T4, (save_permute_11>>48 & 0xFFFF) | |||
| ori T2, T2, (perm_const2>>32 & 0xFFFF) | |||
| ori T1, T1, (perm_const1>>32 & 0xFFFF) | |||
| ori T3, T3, (save_permute_12>>32 & 0xFFFF) | |||
| ori T4, T4, (save_permute_11>>32 & 0xFFFF) | |||
| #else | |||
| lis T2, perm_const2@highest | |||
| lis T1, perm_const1@highest | |||
| lis T3, save_permute_12@highest | |||
| lis T4, save_permute_11@highest | |||
| ori T2, T2, perm_const2@higher | |||
| ori T1, T1, perm_const1@higher | |||
| ori T3, T3, save_permute_12@higher | |||
| ori T4, T4, save_permute_11@higher | |||
| #endif | |||
| rldicr T2, T2, 32, 31 | |||
| rldicr T1, T1, 32, 31 | |||
| rldicr T3, T3, 32, 31 | |||
| rldicr T4, T4, 32, 31 | |||
| #if (_AIX) | |||
| oris T2, T2, (perm_const2>>16 & 0xFFFF) | |||
| oris T1, T1, (perm_const1>>16 & 0xFFFF) | |||
| oris T3, T3, (save_permute_12>>16 & 0xFFFF) | |||
| oris T4, T4, (save_permute_11>>16 & 0xFFFF) | |||
| ori T2, T2, (perm_const2 & 0xFFFF) | |||
| ori T1, T1, (perm_const1 & 0xFFFF) | |||
| ori T3, T3, (save_permute_12 & 0xFFFF) | |||
| ori T4, T4, (save_permute_11 & 0xFFFF) | |||
| #else | |||
| oris T2, T2, perm_const2@h | |||
| oris T1, T1, perm_const1@h | |||
| oris T3, T3, save_permute_12@h | |||
| @@ -200,7 +226,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ori T1, T1, perm_const1@l | |||
| ori T3, T3, save_permute_12@l | |||
| ori T4, T4, save_permute_11@l | |||
| #endif | |||
| li r0,0 | |||
| li PRE,512 | |||
| @@ -218,6 +218,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .if \OffsetA != 0 | |||
| addi \AREG, \AREG, \OffsetA | |||
| .endif | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| xvf32gerpp 3, 36, 34 | |||
| xvf32gerpp 2, 37, 34 | |||
| xvf32gerpp 1, 32, 34 | |||
| xvf32gerpp 0, 33, 34 | |||
| xvf32gerpp 7, 36, 35 | |||
| xvf32gerpp 6, 37, 35 | |||
| xvf32gerpp 5, 32, 35 | |||
| xvf32gerpp 4, 33, 35 | |||
| #else | |||
| xvf32gerpp 3, 36, 35 | |||
| xvf32gerpp 2, 37, 35 | |||
| xvf32gerpp 1, 32, 35 | |||
| @@ -226,6 +236,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| xvf32gerpp 6, 37, 34 | |||
| xvf32gerpp 5, 32, 34 | |||
| xvf32gerpp 4, 33, 34 | |||
| #endif | |||
| .endm | |||
| .macro LOAD4x8_2 | |||
| @@ -255,6 +266,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro KERNEL4x8_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete | |||
| #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ | |||
| xvf32gerpp 3, 36, 34 | |||
| xvf32gerpp 2, 37, 34 | |||
| xvf32gerpp 1, 32, 34 | |||
| xvf32gerpp 0, 33, 34 | |||
| xvf32gerpp 7, 36, 35 | |||
| xvf32gerpp 6, 37, 35 | |||
| xvf32gerpp 5, 32, 35 | |||
| xvf32gerpp 4, 33, 35 | |||
| #else | |||
| xvf32gerpp 3, 36, 35 | |||
| xvf32gerpp 2, 37, 35 | |||
| xvf32gerpp 1, 32, 35 | |||
| @@ -263,11 +284,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| xvf32gerpp 6, 37, 34 | |||
| xvf32gerpp 5, 32, 34 | |||
| xvf32gerpp 4, 33, 34 | |||
| #endif | |||
| .if \Complete==0 | |||
| lxvp vs34, DISP8(\Index, \OffsetB)(\BREG) | |||
| lxvp vs32, DISP16(\Index, 0+\OffsetA)(\AREG) | |||
| lxvp vs36, DISP16(\Index, 32+\OffsetA)(\AREG) | |||
| .endif | |||
| #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ | |||
| xvf32gerpp 3, 42, 38 | |||
| xvf32gerpp 2, 43, 38 | |||
| xvf32gerpp 1, 40, 38 | |||
| xvf32gerpp 0, 41, 38 | |||
| xvf32gerpp 7, 42, 39 | |||
| xvf32gerpp 6, 43, 39 | |||
| xvf32gerpp 5, 40, 39 | |||
| xvf32gerpp 4, 41, 39 | |||
| #else | |||
| xvf32gerpp 3, 42, 39 | |||
| xvf32gerpp 2, 43, 39 | |||
| xvf32gerpp 1, 40, 39 | |||
| @@ -276,6 +308,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| xvf32gerpp 6, 43, 38 | |||
| xvf32gerpp 5, 40, 38 | |||
| xvf32gerpp 4, 41, 38 | |||
| #endif | |||
| .if \Complete==0 | |||
| lxvp vs40, DISP16(\Index, 64+\OffsetA)(\AREG) | |||
| lxvp vs38, DISP8(\Index, 32+\OffsetB)(\BREG) | |||
| @@ -393,22 +426,46 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| RECONSTRUCT_PAIR2 | |||
| #ifndef TRMMKERNEL | |||
| /* add */ | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| xxpermdi vs1, vs0, vs8, 1 | |||
| xxpermdi vs3, vs2, vs10, 1 | |||
| xxpermdi vs5, vs4, vs12, 1 | |||
| xxpermdi vs7, vs6, vs14, 1 | |||
| xxpermdi vs9, vs8, vs0, 1 | |||
| xxpermdi vs11, vs10, vs2, 1 | |||
| #else | |||
| xxpermdi vs1, vs8, vs0, 2 | |||
| xxpermdi vs3, vs10, vs2, 2 | |||
| xxpermdi vs5, vs12, vs4, 2 | |||
| xxpermdi vs7, vs14, vs6, 2 | |||
| xxpermdi vs9, vs0, vs8, 2 | |||
| xxpermdi vs11, vs2, vs10, 2 | |||
| #endif | |||
| xvaddsp vs24, vs24, vs3 | |||
| xvaddsp vs25, vs25, vs1 | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| xxpermdi vs13, vs12, vs4, 1 | |||
| xxpermdi vs15, vs14, vs6, 1 | |||
| #else | |||
| xxpermdi vs13, vs4, vs12, 2 | |||
| xxpermdi vs15, vs6, vs14, 2 | |||
| #endif | |||
| xvaddsp vs26, vs26, vs7 | |||
| xvaddsp vs27, vs27, vs5 | |||
| xvaddsp vs28, vs28, vs11 | |||
| xvaddsp vs29, vs29, vs9 | |||
| xvaddsp vs30, vs30, vs15 | |||
| xvaddsp vs31, vs31, vs13 | |||
| #else | |||
| #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ | |||
| xxpermdi vs25, vs0, vs8, 1 | |||
| xxpermdi vs24, vs2, vs10, 1 | |||
| xxpermdi vs27, vs4, vs12, 1 | |||
| xxpermdi vs26, vs6, vs14, 1 | |||
| xxpermdi vs29, vs8, vs0, 1 | |||
| xxpermdi vs28, vs10, vs2, 1 | |||
| xxpermdi vs31, vs12, vs4, 1 | |||
| xxpermdi vs30, vs14, vs6, 1 | |||
| #else | |||
| xxpermdi vs25, vs8, vs0, 2 | |||
| xxpermdi vs24, vs10, vs2, 2 | |||
| @@ -418,6 +475,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| xxpermdi vs28, vs2, vs10, 2 | |||
| xxpermdi vs31, vs4, vs12, 2 | |||
| xxpermdi vs30, vs6, vs14, 2 | |||
| #endif | |||
| #endif | |||
| stxvp vs24, 0(CO) | |||
| MULT_APLHA_PART1 vs48, vs56, vs0, vs1 | |||
| @@ -443,22 +501,46 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| RECONSTRUCT_PAIR2 | |||
| #ifndef TRMMKERNEL | |||
| /* add */ | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| xxpermdi vs1, vs0, vs8, 1 | |||
| xxpermdi vs3, vs2, vs10, 1 | |||
| xxpermdi vs5, vs4, vs12, 1 | |||
| xxpermdi vs7, vs6, vs14, 1 | |||
| xxpermdi vs9, vs8, vs0, 1 | |||
| xxpermdi vs11, vs10, vs2, 1 | |||
| #else | |||
| xxpermdi vs1, vs8, vs0, 2 | |||
| xxpermdi vs3, vs10, vs2, 2 | |||
| xxpermdi vs5, vs12, vs4, 2 | |||
| xxpermdi vs7, vs14, vs6, 2 | |||
| xxpermdi vs9, vs0, vs8, 2 | |||
| xxpermdi vs11, vs2, vs10, 2 | |||
| #endif | |||
| xvaddsp vs32, vs32, vs3 | |||
| xvaddsp vs33, vs33, vs1 | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| xxpermdi vs13, vs12, vs4, 1 | |||
| xxpermdi vs15, vs14, vs6, 1 | |||
| #else | |||
| xxpermdi vs13, vs4, vs12, 2 | |||
| xxpermdi vs15, vs6, vs14, 2 | |||
| #endif | |||
| xvaddsp vs40, vs40, vs7 | |||
| xvaddsp vs41, vs41, vs5 | |||
| xvaddsp vs34, vs34, vs11 | |||
| xvaddsp vs35, vs35, vs9 | |||
| xvaddsp vs42, vs42, vs15 | |||
| xvaddsp vs43, vs43, vs13 | |||
| #else | |||
| #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ | |||
| xxpermdi vs33, vs0, vs8, 1 | |||
| xxpermdi vs32, vs2, vs10, 1 | |||
| xxpermdi vs41, vs4, vs12, 1 | |||
| xxpermdi vs40, vs6, vs14, 1 | |||
| xxpermdi vs35, vs8, vs0, 1 | |||
| xxpermdi vs34, vs10, vs2, 1 | |||
| xxpermdi vs43, vs12, vs4, 1 | |||
| xxpermdi vs42, vs14, vs6, 1 | |||
| #else | |||
| xxpermdi vs33, vs8, vs0, 2 | |||
| xxpermdi vs32, vs10, vs2, 2 | |||
| @@ -468,6 +550,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| xxpermdi vs34, vs2, vs10, 2 | |||
| xxpermdi vs43, vs4, vs12, 2 | |||
| xxpermdi vs42, vs6, vs14, 2 | |||
| #endif | |||
| #endif | |||
| stxvp vs32, 0(T2) | |||
| stxvp vs40, 32(T2) | |||
| @@ -510,10 +593,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .if \OffsetA != 0 | |||
| addi \AREG, \AREG, \OffsetA | |||
| .endif | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| xvf32gerpp 3, 32, 35 | |||
| xvf32gerpp 2, 33, 35 | |||
| xvf32gerpp 1, 32, 34 | |||
| xvf32gerpp 0, 33, 34 | |||
| #else | |||
| xvf32gerpp 3, 32, 34 | |||
| xvf32gerpp 2, 33, 34 | |||
| xvf32gerpp 1, 32, 35 | |||
| xvf32gerpp 0, 33, 35 | |||
| #endif | |||
| .endm | |||
| .macro LOAD4x4_2 | |||
| @@ -541,18 +631,32 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro KERNEL4x4_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| xvf32gerpp 3, 32, 35 | |||
| xvf32gerpp 2, 33, 35 | |||
| xvf32gerpp 1, 32, 34 | |||
| xvf32gerpp 0, 33, 34 | |||
| #else | |||
| xvf32gerpp 3, 32, 34 | |||
| xvf32gerpp 2, 33, 34 | |||
| xvf32gerpp 1, 32, 35 | |||
| xvf32gerpp 0, 33, 35 | |||
| #endif | |||
| .if \Complete==0 | |||
| lxvp vs34, DISP8(\Index, \OffsetB)(\BREG) | |||
| lxvp vs32, DISP8(\Index, 0+\OffsetA)(\AREG) | |||
| .endif | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| xvf32gerpp 3, 36, 39 | |||
| xvf32gerpp 2, 37, 39 | |||
| xvf32gerpp 1, 36, 38 | |||
| xvf32gerpp 0, 37, 38 | |||
| #else | |||
| xvf32gerpp 3, 36, 38 | |||
| xvf32gerpp 2, 37, 38 | |||
| xvf32gerpp 1, 36, 39 | |||
| xvf32gerpp 0, 37, 39 | |||
| #endif | |||
| .if \Complete==0 | |||
| lxvp vs38, DISP8(\Index, 32+\OffsetB)(\BREG) | |||
| lxvp vs36, DISP8(\Index, 32+\OffsetA)(\AREG) | |||
| @@ -606,6 +710,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| RECONSTRUCT_PAIR2 | |||
| #ifndef TRMMKERNEL | |||
| /* add */ | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| xxpermdi vs1, vs0, vs8, 1 | |||
| xxpermdi vs3, vs2, vs10, 1 | |||
| xxpermdi vs9, vs8, vs0, 1 | |||
| xxpermdi vs11, vs10, vs2, 1 | |||
| xxpermdi vs5, vs4, vs12, 1 | |||
| xxpermdi vs7, vs6, vs14, 1 | |||
| xxpermdi vs13, vs12, vs4, 1 | |||
| xxpermdi vs15, vs14, vs6, 1 | |||
| #else | |||
| xxpermdi vs1, vs8, vs0, 2 | |||
| xxpermdi vs3, vs10, vs2, 2 | |||
| xxpermdi vs9, vs0, vs8, 2 | |||
| @@ -614,6 +728,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| xxpermdi vs7, vs14, vs6, 2 | |||
| xxpermdi vs13, vs4, vs12, 2 | |||
| xxpermdi vs15, vs6, vs14, 2 | |||
| #endif | |||
| xvaddsp vs24, vs24, vs3 | |||
| xvaddsp vs25, vs25, vs1 | |||
| xvaddsp vs26, vs26, vs11 | |||
| @@ -622,6 +737,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| xvaddsp vs29, vs29, vs5 | |||
| xvaddsp vs30, vs30, vs15 | |||
| xvaddsp vs31, vs31, vs13 | |||
| #else | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| xxpermdi vs25, vs0, vs8, 1 | |||
| xxpermdi vs24, vs2, vs10, 1 | |||
| xxpermdi vs27, vs8, vs0, 1 | |||
| xxpermdi vs26, vs10, vs2, 1 | |||
| xxpermdi vs29, vs4, vs12, 1 | |||
| xxpermdi vs28, vs6, vs14, 1 | |||
| xxpermdi vs31, vs12, vs4, 1 | |||
| xxpermdi vs30, vs14, vs6, 1 | |||
| #else | |||
| xxpermdi vs25, vs8, vs0, 2 | |||
| xxpermdi vs24, vs10, vs2, 2 | |||
| @@ -631,6 +756,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| xxpermdi vs28, vs14, vs6, 2 | |||
| xxpermdi vs31, vs4, vs12, 2 | |||
| xxpermdi vs30, vs6, vs14, 2 | |||
| #endif | |||
| #endif | |||
| stxvp vs24, 0(CO) | |||
| stxvp vs26, 0(T1) | |||
| @@ -672,8 +798,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .if \OffsetA != 0 | |||
| addi \AREG, \AREG, \OffsetA | |||
| .endif | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| xvf32gerpp 1, 35, 32 | |||
| xvf32gerpp 0, 34, 32 | |||
| #else | |||
| xvf32gerpp 1, 34, 32 | |||
| xvf32gerpp 0, 35, 32 | |||
| #endif | |||
| .endm | |||
| .macro LOAD4x2_2 | |||
| @@ -700,13 +831,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro KERNEL4x2_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| xvf32gerpp 1, 35, 32 | |||
| xvf32gerpp 0, 34, 32 | |||
| #else | |||
| xvf32gerpp 1, 34, 33 | |||
| xvf32gerpp 0, 35, 33 | |||
| #endif | |||
| .if \Complete==0 | |||
| lxvp vs34, DISP8(\Index, 0+\OffsetB)(\BREG) | |||
| .endif | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| xvf32gerpp 1, 37, 33 | |||
| xvf32gerpp 0, 36, 33 | |||
| #else | |||
| xvf32gerpp 1, 36, 32 | |||
| xvf32gerpp 0, 37, 32 | |||
| #endif | |||
| .if \Complete==0 | |||
| lxvp vs32, DISP4(\Index, \OffsetA)(\AREG) | |||
| lxvp vs36, DISP8(\Index, 32+\OffsetB)(\BREG) | |||
| @@ -757,19 +898,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| RECONSTRUCT_PAIR1 | |||
| #ifndef TRMMKERNEL | |||
| /* add */ | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| xxpermdi vs1, vs0, vs8, 0 | |||
| xxpermdi vs9, vs2, vs10, 0 | |||
| xxpermdi vs3, vs8, vs0, 3 | |||
| xxpermdi vs11, vs10, vs2, 3 | |||
| #else | |||
| xxpermdi vs1, vs8, vs0, 0 | |||
| xxpermdi vs9, vs10, vs2, 0 | |||
| xxpermdi vs3, vs0, vs8, 3 | |||
| xxpermdi vs11, vs2, vs10, 3 | |||
| #endif | |||
| xvaddsp vs24, vs24, vs1 | |||
| xvaddsp vs26, vs26, vs9 | |||
| xvaddsp vs25, vs25, vs3 | |||
| xvaddsp vs27, vs27, vs11 | |||
| #else | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| xxpermdi vs24, vs0, vs8, 0 | |||
| xxpermdi vs26, vs2, vs10, 0 | |||
| xxpermdi vs25, vs8, vs0, 3 | |||
| xxpermdi vs27, vs10, vs2, 3 | |||
| #else | |||
| xxpermdi vs24, vs8, vs0, 0 | |||
| xxpermdi vs26, vs10, vs2, 0 | |||
| xxpermdi vs25, vs0, vs8, 3 | |||
| xxpermdi vs27, vs2, vs10, 3 | |||
| #endif | |||
| #endif | |||
| stxv vs24, 0(CO) | |||
| stxv vs25, 0(T1) | |||
| @@ -811,8 +966,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .if \OffsetA != 0 | |||
| addi \AREG, \AREG, \OffsetA | |||
| .endif | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| xvf32gerpp 0, 34, 32 | |||
| xvf32gerpp 1, 35, 32 | |||
| #else | |||
| xvf32gerpp 0, 35, 32 | |||
| xvf32gerpp 1, 34, 32 | |||
| #endif | |||
| .endm | |||
| .macro LOAD4x1_2 | |||
| @@ -822,8 +982,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .macro LOAD4x1_2O OffsetA, OffsetB | |||
| lxv vs32, (\OffsetA)(AO) | |||
| vspltisb v6, 0 | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| xxpermdi vs33, vs32, vs38, 2 | |||
| xxpermdi vs32, vs32, vs38, 0 | |||
| #else | |||
| xxpermdi vs33, vs32, vs38, 0 | |||
| xxpermdi vs32, vs32, vs38, 2 | |||
| #endif | |||
| lxvp vs34, (0+\OffsetB)(BO) | |||
| lxvp vs36, (32+\OffsetB)(BO) | |||
| .endm | |||
| @@ -842,18 +1007,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro KERNEL4x1_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| xvf32gerpp 0, 34, 32 | |||
| xvf32gerpp 1, 35, 32 | |||
| #else | |||
| xvf32gerpp 0, 35, 32 | |||
| xvf32gerpp 1, 34, 32 | |||
| #endif | |||
| .if \Complete==0 | |||
| lxvp vs34, DISP8(\Index, 0+\OffsetB)(\BREG) | |||
| .endif | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| xvf32gerpp 0, 36, 33 | |||
| xvf32gerpp 1, 37, 33 | |||
| #else | |||
| xvf32gerpp 0, 37, 33 | |||
| xvf32gerpp 1, 36, 33 | |||
| #endif | |||
| .if \Complete==0 | |||
| lxv vs32, DISP2(\Index, \OffsetA)(\AREG) | |||
| lxvp vs36, DISP8(\Index, 32+\OffsetB)(\BREG) | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| xxpermdi vs33, vs32, vs38, 2 | |||
| xxpermdi vs32, vs32, vs38, 0 | |||
| #else | |||
| xxpermdi vs33, vs32, vs38, 0 | |||
| xxpermdi vs32, vs32, vs38, 2 | |||
| #endif | |||
| .endif | |||
| .if \IsLast==1 | |||
| .if \Complete==1 | |||
| @@ -1001,19 +1181,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro KERNEL2x8_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| xvf32gerpp 2, 37, 34 | |||
| xvf32gerpp 3, 36, 34 | |||
| xvf32gerpp 0, 33, 34 | |||
| xvf32gerpp 1, 32, 34 | |||
| #else | |||
| xvf32gerpp 2, 37, 35 | |||
| xvf32gerpp 3, 36, 35 | |||
| xvf32gerpp 0, 33, 35 | |||
| xvf32gerpp 1, 32, 35 | |||
| #endif | |||
| .if \Complete==0 | |||
| lxvp vs32, DISP16(\Index, 0+\OffsetA)(\AREG) | |||
| lxvp vs36, DISP16(\Index, 32+\OffsetA)(\AREG) | |||
| .endif | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| xvf32gerpp 2, 41, 35 | |||
| xvf32gerpp 3, 40, 35 | |||
| xvf32gerpp 0, 39, 35 | |||
| xvf32gerpp 1, 38, 35 | |||
| #else | |||
| xvf32gerpp 2, 41, 34 | |||
| xvf32gerpp 3, 40, 34 | |||
| xvf32gerpp 0, 39, 34 | |||
| xvf32gerpp 1, 38, 34 | |||
| #endif | |||
| .if \Complete==0 | |||
| lxvp vs34, DISP4(\Index, \OffsetB)(\BREG) | |||
| @@ -1068,16 +1262,30 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| RECONSTRUCT_PAIR2 | |||
| #ifndef TRMMKERNEL | |||
| /* add */ | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| xxpermdi vs1, vs0, vs8, 1 | |||
| xxpermdi vs3, vs2, vs10, 1 | |||
| xxpermdi vs5, vs4, vs12, 1 | |||
| xxpermdi vs7, vs6, vs14, 1 | |||
| xxpermdi vs9, vs8, vs0, 1 | |||
| xxpermdi vs11, vs10, vs2, 1 | |||
| #else | |||
| xxpermdi vs1, vs8, vs0, 2 | |||
| xxpermdi vs3, vs10, vs2, 2 | |||
| xxpermdi vs5, vs12, vs4, 2 | |||
| xxpermdi vs7, vs14, vs6, 2 | |||
| xxpermdi vs9, vs0, vs8, 2 | |||
| xxpermdi vs11, vs2, vs10, 2 | |||
| #endif | |||
| xvaddsp vs24, vs24, vs3 | |||
| xvaddsp vs25, vs25, vs1 | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| xxpermdi vs13, vs12, vs4, 1 | |||
| xxpermdi vs15, vs14, vs6, 1 | |||
| #else | |||
| xxpermdi vs13, vs4, vs12, 2 | |||
| xxpermdi vs15, vs6, vs14, 2 | |||
| #endif | |||
| xvaddsp vs26, vs26, vs7 | |||
| xvaddsp vs27, vs27, vs5 | |||
| xvaddsp vs28, vs28, vs11 | |||
| @@ -1085,6 +1293,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| xvaddsp vs30, vs30, vs15 | |||
| xvaddsp vs31, vs31, vs13 | |||
| #else | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| xxpermdi vs25, vs0, vs8, 1 | |||
| xxpermdi vs24, vs2, vs10, 1 | |||
| xxpermdi vs27, vs4, vs12, 1 | |||
| xxpermdi vs26, vs6, vs14, 1 | |||
| xxpermdi vs29, vs8, vs0, 1 | |||
| xxpermdi vs28, vs10, vs2, 1 | |||
| xxpermdi vs31, vs12, vs4, 1 | |||
| xxpermdi vs30, vs14, vs6, 1 | |||
| #else | |||
| xxpermdi vs25, vs8, vs0, 2 | |||
| xxpermdi vs24, vs10, vs2, 2 | |||
| xxpermdi vs27, vs12, vs4, 2 | |||
| @@ -1093,6 +1311,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| xxpermdi vs28, vs2, vs10, 2 | |||
| xxpermdi vs31, vs4, vs12, 2 | |||
| xxpermdi vs30, vs6, vs14, 2 | |||
| #endif | |||
| #endif | |||
| stxvp vs24, 0(CO) | |||
| stxvp vs26, 32(CO) | |||
| @@ -1161,13 +1380,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro KERNEL2x4_2 AREG, BREG, OffsetA, OffsetB, Index, IsLast, Complete | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| xvf32gerpp 0, 33, 34 | |||
| xvf32gerpp 1, 32, 34 | |||
| #else | |||
| xvf32gerpp 0, 33, 35 | |||
| xvf32gerpp 1, 32, 35 | |||
| #endif | |||
| .if \Complete==0 | |||
| lxvp vs32, DISP8(\Index, 0+\OffsetA)(\AREG) | |||
| .endif | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| xvf32gerpp 0, 37, 35 | |||
| xvf32gerpp 1, 36, 35 | |||
| #else | |||
| xvf32gerpp 0, 37, 34 | |||
| xvf32gerpp 1, 36, 34 | |||
| #endif | |||
| .if \Complete==0 | |||
| lxvp vs34, DISP4(\Index, \OffsetB)(\BREG) | |||
| lxvp vs36, DISP8(\Index, 32+\OffsetA)(\AREG) | |||
| @@ -1206,19 +1436,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| RECONSTRUCT_PAIR1 | |||
| #ifndef TRMMKERNEL | |||
| /* add */ | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| xxpermdi vs1, vs0, vs8, 1 | |||
| xxpermdi vs3, vs2, vs10, 1 | |||
| xxpermdi vs9, vs8, vs0, 1 | |||
| xxpermdi vs11, vs10, vs2, 1 | |||
| #else | |||
| xxpermdi vs1, vs8, vs0, 2 | |||
| xxpermdi vs3, vs10, vs2, 2 | |||
| xxpermdi vs9, vs0, vs8, 2 | |||
| xxpermdi vs11, vs2, vs10, 2 | |||
| #endif | |||
| xvaddsp vs24, vs24, vs3 | |||
| xvaddsp vs25, vs25, vs1 | |||
| xvaddsp vs26, vs26, vs11 | |||
| xvaddsp vs27, vs27, vs9 | |||
| #else | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| xxpermdi vs25, vs0, vs8, 1 | |||
| xxpermdi vs24, vs2, vs10, 1 | |||
| xxpermdi vs27, vs8, vs0, 1 | |||
| xxpermdi vs26, vs10, vs2, 1 | |||
| #else | |||
| xxpermdi vs25, vs8, vs0, 2 | |||
| xxpermdi vs24, vs10, vs2, 2 | |||
| xxpermdi vs27, vs0, vs8, 2 | |||
| xxpermdi vs26, vs2, vs10, 2 | |||
| #endif | |||
| #endif | |||
| stxvp vs24, 0(CO) | |||
| stxvp vs26, 0(T1) | |||
| @@ -1330,13 +1574,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| xxperm vs8, vs9, save_permute_1 | |||
| #ifndef TRMMKERNEL | |||
| /* add */ | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| xxpermdi vs1, vs0, vs8, 0 | |||
| xxpermdi vs9, vs8, vs0, 3 | |||
| #else | |||
| xxpermdi vs1, vs8, vs0, 0 | |||
| xxpermdi vs9, vs0, vs8, 3 | |||
| #endif | |||
| xvaddsp vs24, vs24, vs1 | |||
| xvaddsp vs26, vs26, vs9 | |||
| #else | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| xxpermdi vs24, vs0, vs8, 0 | |||
| xxpermdi vs26, vs8, vs0, 3 | |||
| #else | |||
| xxpermdi vs24, vs8, vs0, 0 | |||
| xxpermdi vs26, vs0, vs8, 3 | |||
| #endif | |||
| #endif | |||
| stxv vs24, 0(CO) | |||
| stxv vs26, 0(T1) | |||
| @@ -1528,8 +1782,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| lxvp vs32, (0+\OffsetA)(AO) | |||
| lxvp vs36, (32+\OffsetA)(AO) | |||
| vspltisb v10, 0 | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| xxpermdi vs35, vs34, vs42, 2 | |||
| xxpermdi vs34, vs34, vs42, 0 | |||
| #else | |||
| xxpermdi vs35, vs34, vs42, 0 | |||
| xxpermdi vs34, vs34, vs42, 2 | |||
| #endif | |||
| lxvp vs38, (64+\OffsetA)(AO) | |||
| lxvp vs40, (64+32+\OffsetA)(AO) | |||
| .endm | |||
| @@ -1567,8 +1826,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| xvf32gerpp 3, 35, 40 | |||
| .if \Complete==0 | |||
| lxv vs34, DISP2(\Index, \OffsetB)(\BREG) | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| xxpermdi vs35, vs34, vs42, 2 | |||
| xxpermdi vs34, vs34, vs42, 0 | |||
| #else | |||
| xxpermdi vs35, vs34, vs42, 0 | |||
| xxpermdi vs34, vs34, vs42, 2 | |||
| #endif | |||
| lxvp vs40, DISP16(\Index, 64+32+\OffsetA)(\AREG) | |||
| .endif | |||
| .if \IsLast==1 | |||
| @@ -1634,10 +1898,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| MULT_APLHA_PART2 vs34, vs42, vs4, vs5 | |||
| MULT_APLHA_PART2 vs35, vs43, vs6, vs7 | |||
| /* reconstruct r, i pairs*/ | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| xxperm vs0, vs1, save_permute_1 | |||
| xxperm vs2, vs3, save_permute_1 | |||
| xxperm vs4, vs5, save_permute_1 | |||
| xxperm vs6, vs7, save_permute_1 | |||
| #else | |||
| xxperm vs0, vs1, vs28 | |||
| xxperm vs2, vs3, vs28 | |||
| xxperm vs4, vs5, vs28 | |||
| xxperm vs6, vs7, vs28 | |||
| #endif | |||
| #ifndef TRMMKERNEL | |||
| /* add */ | |||
| xvaddsp vs24, vs24, vs2 | |||
| @@ -1648,10 +1919,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| stxvp vs26, 32(CO) | |||
| #else | |||
| /* reconstruct r, i pairs*/ | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| stxv vs2, 0(CO) | |||
| stxv vs0, 16(CO) | |||
| stxv vs6, 32(CO) | |||
| stxv vs4, 48(CO) | |||
| #else | |||
| stxv vs0, 0(CO) | |||
| stxv vs2, 16(CO) | |||
| stxv vs4, 32(CO) | |||
| stxv vs6, 48(CO) | |||
| #endif | |||
| #endif | |||
| addi CO, CO, 64 | |||
| .endm | |||
| @@ -1701,8 +1979,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| lxv vs34, (\OffsetB)(BO) | |||
| lxvp vs32, (0+\OffsetA)(AO) | |||
| vspltisb v6, 0 | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| xxpermdi vs35, vs34, vs38, 2 | |||
| xxpermdi vs34, vs34, vs38, 0 | |||
| #else | |||
| xxpermdi vs35, vs34, vs38, 0 | |||
| xxpermdi vs34, vs34, vs38, 2 | |||
| #endif | |||
| lxvp vs36, (32+\OffsetA)(AO) | |||
| .endm | |||
| @@ -1729,8 +2012,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| xvf32gerpp 1, 35, 36 | |||
| .if \Complete==0 | |||
| lxv vs34, DISP2(\Index, \OffsetB)(\BREG) | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| xxpermdi vs35, vs34, vs38, 2 | |||
| xxpermdi vs34, vs34, vs38, 0 | |||
| #else | |||
| xxpermdi vs35, vs34, vs38, 0 | |||
| xxpermdi vs34, vs34, vs38, 2 | |||
| #endif | |||
| lxvp vs36, DISP8(\Index, 32+\OffsetA)(\AREG) | |||
| .endif | |||
| .if \IsLast==1 | |||
| @@ -1775,8 +2063,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| MULT_APLHA_PART2 vs32, vs40, vs0, vs1 | |||
| MULT_APLHA_PART2 vs33, vs41, vs2, vs3 | |||
| /* reconstruct r, i pairs*/ | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| xxperm vs0, vs1, save_permute_1 | |||
| xxperm vs2, vs3, save_permute_1 | |||
| #else | |||
| xxperm vs0, vs1, vs28 | |||
| xxperm vs2, vs3, vs28 | |||
| #endif | |||
| #ifndef TRMMKERNEL | |||
| /* add */ | |||
| xvaddsp vs24, vs24, vs2 | |||
| @@ -1784,8 +2077,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| stxvp vs24, 0(CO) | |||
| #else | |||
| /* reconstruct r, i pairs*/ | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| stxv vs2, 0(CO) | |||
| stxv vs0, 16(CO) | |||
| #else | |||
| stxv vs0, 0(CO) | |||
| stxv vs2, 16(CO) | |||
| #endif | |||
| #endif | |||
| addi CO, CO, 32 | |||
| .endm | |||
| @@ -1904,7 +2202,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| MULT_APLHA_PART1 vs32, vs40, vs0, vs1 | |||
| MULT_APLHA_PART2 vs32, vs40, vs0, vs1 | |||
| /* reconstruct r, i pairs*/ | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| xxperm vs0, vs1, save_permute_1 | |||
| #else | |||
| xxperm vs0, vs1, vs28 | |||
| #endif | |||
| #ifndef TRMMKERNEL | |||
| /* add */ | |||
| xvaddsp vs24, vs24, vs0 | |||
| @@ -2018,7 +2320,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| MULT_APLHA_PART1 vs32, vs40, vs37, vs1 | |||
| MULT_APLHA_PART2 vs32, vs40, vs37, vs1 | |||
| /* reconstruct r, i pairs*/ | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| xxperm vs37, vs1, save_permute_1 | |||
| #else | |||
| xxperm vs37, vs1, vs28 | |||
| #endif | |||
| #ifndef TRMMKERNEL | |||
| /* add */ | |||
| xvaddsp vs36, vs36, vs37 | |||
| @@ -30,7 +30,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| static void zscal_kernel_8 (long n, float *x, float alpha_r, float alpha_i) | |||
| { | |||
| __vector float t0 = {-alpha_i, alpha_i, -alpha_i, alpha_i}; | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| __vector unsigned char mask = {4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11}; | |||
| #else | |||
| __vector unsigned char mask = { 11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4}; | |||
| #endif | |||
| __asm__ | |||
| ( | |||
| "dcbt 0, %2 \n\t" | |||
| @@ -39,10 +39,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||
| #if defined(POWER8) || defined(POWER9) | |||
| #include "cswap_microk_power8.c" | |||
| #elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) | |||
| #include "cswap_microk_power10.c" | |||
| #elif defined(POWER10) | |||
| #include "cswap_microk_power8.c" | |||
| #include "cswap_microk_power10.c" | |||
| #endif | |||
| #endif | |||
| @@ -49,14 +49,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||
| #if defined(POWER8) || defined(POWER9) | |||
| #include "dasum_microk_power8.c" | |||
| #elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) | |||
| #include "dasum_microk_power10.c" | |||
| #elif defined(POWER10) | |||
| #include "dasum_microk_power8.c" | |||
| #include "dasum_microk_power10.c" | |||
| #endif | |||
| #endif | |||
| #ifndef HAVE_KERNEL_16 | |||
| static FLOAT dasum_kernel_16(BLASLONG n, FLOAT *x1) | |||
| @@ -114,7 +111,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| if ( inc_x == 1 ) | |||
| { | |||
| #if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) | |||
| #if defined(POWER10) | |||
| if ( n >= 32) | |||
| { | |||
| BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3; | |||
| @@ -40,18 +40,27 @@ static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y | |||
| XXSPLTD_S(32,%x9,0) // alpha, alpha | |||
| "sldi %6, %13, 3 \n\t" // lda * sizeof (double) | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| "xvmuldp 34, 40, 32 \n\t" // x0 * alpha, x1 * alpha | |||
| "xvmuldp 35, 41, 32 \n\t" // x2 * alpha, x3 * alpha | |||
| #else | |||
| "xvmuldp 34, 41, 32 \n\t" // x0 * alpha, x1 * alpha | |||
| "xvmuldp 35, 40, 32 \n\t" // x2 * alpha, x3 * alpha | |||
| #endif | |||
| "add %4, %3, %6 \n\t" // a0 = ap, a1 = a0 + lda | |||
| "add %6, %6, %6 \n\t" // 2 * lda | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| XXSPLTD_S(32,34,0) // x0 * alpha, x0 * alpha | |||
| XXSPLTD_S(33,34,1) // x1 * alpha, x1 * alpha | |||
| XXSPLTD_S(34,35,0) // x2 * alpha, x2 * alpha | |||
| XXSPLTD_S(35,35,1) // x3 * alpha, x3 * alpha | |||
| #else | |||
| XXSPLTD_S(32,34,1) // x0 * alpha, x0 * alpha | |||
| XXSPLTD_S(33,34,0) // x1 * alpha, x1 * alpha | |||
| XXSPLTD_S(34,35,1) // x2 * alpha, x2 * alpha | |||
| XXSPLTD_S(35,35,0) // x3 * alpha, x3 * alpha | |||
| #endif | |||
| "add %5, %3, %6 \n\t" // a2 = a0 + 2 * lda | |||
| "add %6, %4, %6 \n\t" // a3 = a1 + 2 * lda | |||
| @@ -286,6 +295,16 @@ static void dgemv_kernel_4x8 (long n, double *ap, long lda, double *x, double *y | |||
| "add %4, %3, %10 \n\t" // a0 = ap, a1 = a0 + lda | |||
| "add %10, %10, %10 \n\t" // 2 * lda | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| XXSPLTD_S(32,34,0) // x0 * alpha, x0 * alpha | |||
| XXSPLTD_S(33,34,1) // x1 * alpha, x1 * alpha | |||
| XXSPLTD_S(34,35,0) // x2 * alpha, x2 * alpha | |||
| XXSPLTD_S(35,35,1) // x3 * alpha, x3 * alpha | |||
| XXSPLTD_S(48,39,0) // x6 * alpha, x6 * alpha | |||
| XXSPLTD_S(49,39,1) // x7 * alpha, x7 * alpha | |||
| XXSPLTD_S(39,38,1) // x5 * alpha, x5 * alpha | |||
| XXSPLTD_S(38,38,0) // x4 * alpha, x4 * alpha | |||
| #else | |||
| XXSPLTD_S(32,34,1) // x0 * alpha, x0 * alpha | |||
| XXSPLTD_S(33,34,0) // x1 * alpha, x1 * alpha | |||
| XXSPLTD_S(34,35,1) // x2 * alpha, x2 * alpha | |||
| @@ -294,6 +313,7 @@ static void dgemv_kernel_4x8 (long n, double *ap, long lda, double *x, double *y | |||
| XXSPLTD_S(49,39,0) // x7 * alpha, x7 * alpha | |||
| XXSPLTD_S(39,38,0) // x5 * alpha, x5 * alpha | |||
| XXSPLTD_S(38,38,1) // x4 * alpha, x4 * alpha | |||
| #endif | |||
| "add %5, %3, %10 \n\t" // a2 = a0 + 2 * lda | |||
| "add %6, %4, %10 \n\t" // a3 = a1 + 2 * lda | |||
| @@ -319,30 +339,69 @@ static void dgemv_kernel_4x8 (long n, double *ap, long lda, double *x, double *y | |||
| "one%=: \n\t" | |||
| "lxvp 36, 0( %2) \n\t" // y0, y1 | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| "xvmaddadp 36, 40, 32 \n\t" | |||
| "xvmaddadp 37, 41, 32 \n\t" | |||
| #else | |||
| "xvmaddadp 36, 40, 34 \n\t" | |||
| "xvmaddadp 37, 41, 34 \n\t" | |||
| #endif | |||
| "lxvpx 40, %3, %11 \n\t" // a0[0], a0[1] | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| "xvmaddadp 36, 42, 33 \n\t" | |||
| "xvmaddadp 37, 43, 33 \n\t" | |||
| #else | |||
| "xvmaddadp 36, 42, 35 \n\t" | |||
| "xvmaddadp 37, 43, 35 \n\t" | |||
| #endif | |||
| "lxvpx 42, %4, %11 \n\t" // a1[0], a1[1] | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| "xvmaddadp 36, 44, 34 \n\t" | |||
| "xvmaddadp 37, 45, 34 \n\t" | |||
| #else | |||
| "xvmaddadp 36, 44, 32 \n\t" | |||
| "xvmaddadp 37, 45, 32 \n\t" | |||
| #endif | |||
| "lxvpx 44, %5, %11 \n\t" // a2[0], a2[1] | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| "xvmaddadp 36, 46, 35 \n\t" | |||
| "xvmaddadp 37, 47, 35 \n\t" | |||
| #else | |||
| "xvmaddadp 36, 46, 33 \n\t" | |||
| "xvmaddadp 37, 47, 33 \n\t" | |||
| #endif | |||
| "lxvpx 46, %6, %11 \n\t" // a3[0], a3[1] | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| "xvmaddadp 36, 50, 38 \n\t" | |||
| "xvmaddadp 37, 51, 38 \n\t" | |||
| #else | |||
| "xvmaddadp 36, 50, 48 \n\t" | |||
| "xvmaddadp 37, 51, 48 \n\t" | |||
| #endif | |||
| "lxvpx 50, %7, %11 \n\t" // a4[0] | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| "xvmaddadp 36, 52, 39 \n\t" | |||
| "xvmaddadp 37, 53, 39 \n\t" | |||
| #else | |||
| "xvmaddadp 36, 52, 49 \n\t" | |||
| "xvmaddadp 37, 53, 49 \n\t" | |||
| #endif | |||
| "lxvpx 52, %8, %11 \n\t" // a5[0] | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| "xvmaddadp 36, 54, 48 \n\t" | |||
| "xvmaddadp 37, 55, 48 \n\t" | |||
| #else | |||
| "xvmaddadp 36, 54, 38 \n\t" | |||
| "xvmaddadp 37, 55, 38 \n\t" | |||
| #endif | |||
| "lxvpx 54, %9, %11 \n\t" // a6[0] | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| "xvmaddadp 36, 56, 49 \n\t" | |||
| "xvmaddadp 37, 57, 49 \n\t" | |||
| #else | |||
| "xvmaddadp 36, 56, 39 \n\t" | |||
| "xvmaddadp 37, 57, 39 \n\t" | |||
| #endif | |||
| "lxvpx 56, %10, %11 \n\t" // a7[0] | |||
| "addi %11, %11, 32 \n\t" | |||
| @@ -355,6 +414,24 @@ static void dgemv_kernel_4x8 (long n, double *ap, long lda, double *x, double *y | |||
| "two%=: \n\t" | |||
| "lxvp 36, 0( %2) \n\t" // y0, y1 | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| "xvmaddadp 36, 40, 32 \n\t" | |||
| "xvmaddadp 37, 41, 32 \n\t" | |||
| "xvmaddadp 36, 42, 33 \n\t" | |||
| "xvmaddadp 37, 43, 33 \n\t" | |||
| "xvmaddadp 36, 44, 34 \n\t" | |||
| "xvmaddadp 37, 45, 34 \n\t" | |||
| "xvmaddadp 36, 46, 35 \n\t" | |||
| "xvmaddadp 37, 47, 35 \n\t" | |||
| "xvmaddadp 36, 50, 38 \n\t" | |||
| "xvmaddadp 37, 51, 38 \n\t" | |||
| "xvmaddadp 36, 52, 39 \n\t" | |||
| "xvmaddadp 37, 53, 39 \n\t" | |||
| "xvmaddadp 36, 54, 48 \n\t" | |||
| "xvmaddadp 37, 55, 48 \n\t" | |||
| "xvmaddadp 36, 56, 49 \n\t" | |||
| "xvmaddadp 37, 57, 49 \n\t" | |||
| #else | |||
| "xvmaddadp 36, 40, 34 \n\t" | |||
| "xvmaddadp 37, 41, 34 \n\t" | |||
| "xvmaddadp 36, 42, 35 \n\t" | |||
| @@ -371,6 +448,7 @@ static void dgemv_kernel_4x8 (long n, double *ap, long lda, double *x, double *y | |||
| "xvmaddadp 37, 55, 38 \n\t" | |||
| "xvmaddadp 36, 56, 39 \n\t" | |||
| "xvmaddadp 37, 57, 39 \n\t" | |||
| #endif | |||
| "stxvp 36, 0( %2) \n\t" // y0, y1 | |||
| : | |||
| @@ -279,34 +279,58 @@ static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, do | |||
| "lxvp 40, 32(%[y]) \n\t" | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| XXMRGHD_S(42,34,35) | |||
| XXMRGLD_S(43,34,35) | |||
| XXMRGHD_S(44,4,5) | |||
| XXMRGLD_S(45,4,5) | |||
| #else | |||
| XXMRGLD_S(42,35,34) | |||
| XXMRGHD_S(43,35,34) | |||
| XXMRGLD_S(44,5,4) | |||
| XXMRGHD_S(45,5,4) | |||
| #endif | |||
| "xvadddp 42,42,43 \n\t" | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| XXMRGHD_S(46,6,7) | |||
| XXMRGLD_S(47,6,7) | |||
| #else | |||
| XXMRGLD_S(46,7,6) | |||
| XXMRGHD_S(47,7,6) | |||
| #endif | |||
| "xvadddp 44,44,45 \n\t" | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| XXMRGHD_S(48,8,9) | |||
| XXMRGLD_S(49,8,9) | |||
| #else | |||
| XXMRGLD_S(48,9,8) | |||
| XXMRGHD_S(49,9,8) | |||
| #endif | |||
| "xvadddp 46,46,47 \n\t" | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| "xvmaddadp 38,42,36 \n\t" | |||
| "xvmaddadp 39,44,36 \n\t" | |||
| #else | |||
| "xvmaddadp 39,42,36 \n\t" | |||
| "xvmaddadp 38,44,36 \n\t" | |||
| #endif | |||
| "xvadddp 48,48,49 \n\t" | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| "xvmaddadp 41,48,36 \n\t" | |||
| #else | |||
| "xvmaddadp 41,46,36 \n\t" | |||
| #endif | |||
| "stxvp 38, 0(%[y]) \n\t" | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| "xvmaddadp 40,46,36 \n\t" | |||
| #else | |||
| "xvmaddadp 40,48,36 \n\t" | |||
| #endif | |||
| "stxvp 40, 32(%[y]) \n\t" | |||
| : [memy] "+m" (*(double (*)[8])y), | |||
| @@ -42,10 +42,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||
| #if defined(POWER8) || defined(POWER9) | |||
| #include "drot_microk_power8.c" | |||
| #elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) | |||
| #include "drot_microk_power10.c" | |||
| #elif defined(POWER10) | |||
| #include "drot_microk_power8.c" | |||
| #include "drot_microk_power10.c" | |||
| #endif | |||
| #endif | |||
| @@ -110,8 +108,6 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT | |||
| { | |||
| BLASLONG i=0; | |||
| BLASLONG ix=0,iy=0; | |||
| FLOAT *x1=x; | |||
| FLOAT *y1=y; | |||
| FLOAT temp; | |||
| if ( n <= 0 ) return(0); | |||
| @@ -119,7 +115,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT | |||
| if ( (inc_x == 1) && (inc_y == 1) ) | |||
| { | |||
| #if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) | |||
| #if defined(POWER10) | |||
| if ( n >= 16 ) | |||
| { | |||
| BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 3) & 0x3; | |||
| @@ -139,7 +135,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT | |||
| BLASLONG n1 = n & -16; | |||
| if ( n1 > 0 ) | |||
| { | |||
| drot_kernel_16(n1, x1, y1, c, s); | |||
| drot_kernel_16(n1, x, y, c, s); | |||
| i=n1; | |||
| } | |||
| #endif | |||
| @@ -38,10 +38,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||
| #if defined(POWER8) || defined(POWER9) | |||
| #include "dscal_microk_power8.c" | |||
| #elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) | |||
| #include "dscal_microk_power10.c" | |||
| #elif defined(POWER10) | |||
| #include "dscal_microk_power8.c" | |||
| #include "dscal_microk_power10.c" | |||
| #endif | |||
| #endif | |||
| @@ -104,7 +102,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS | |||
| if ( da == 0.0 ) | |||
| { | |||
| #if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) | |||
| #if defined(POWER10) | |||
| if ( n >= 16 ) | |||
| { | |||
| BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3; | |||
| @@ -138,7 +136,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS | |||
| else | |||
| { | |||
| #if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) | |||
| #if defined(POWER10) | |||
| if ( n >= 16 ) | |||
| { | |||
| BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3; | |||
| @@ -38,10 +38,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||
| #if defined(POWER8) || defined(POWER9) | |||
| #include "dswap_microk_power8.c" | |||
| #elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) | |||
| #include "swap_microk_power10.c" | |||
| #elif defined(POWER10) | |||
| #include "dswap_microk_power8.c" | |||
| #include "swap_microk_power10.c" | |||
| #endif | |||
| #endif | |||
| @@ -119,7 +117,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, | |||
| if ( (inc_x == 1) && (inc_y == 1 )) | |||
| { | |||
| #if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) | |||
| #if defined(POWER10) | |||
| if ( n >= 32 ) | |||
| { | |||
| BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 3) & 0x3; | |||
| @@ -330,10 +330,10 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||
| if (inc_x == 1) { | |||
| BLASLONG n1 = n & -32; | |||
| #if defined(_CALL_ELF) && (_CALL_ELF == 2) | |||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||
| BLASLONG n1 = n & -32; | |||
| if (n1 > 0) { | |||
| max = diamax_kernel_32(n1, x, &maxf); | |||
| @@ -49,10 +49,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||
| #if defined(POWER8) || defined(POWER9) | |||
| #include "sasum_microk_power8.c" | |||
| #elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) | |||
| #include "sasum_microk_power10.c" | |||
| #elif defined(POWER10) | |||
| #include "sasum_microk_power8.c" | |||
| #include "sasum_microk_power10.c" | |||
| #endif | |||
| #endif | |||
| @@ -114,7 +112,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| if ( inc_x == 1 ) | |||
| { | |||
| #if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) | |||
| #if defined(POWER10) | |||
| if ( n >= 32 ) | |||
| { | |||
| BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 2) & 0x7; | |||
| @@ -42,10 +42,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||
| #if defined(POWER8) || defined(POWER9) | |||
| #include "srot_microk_power8.c" | |||
| #elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) | |||
| #include "srot_microk_power10.c" | |||
| #elif defined(POWER10) | |||
| #include "srot_microk_power8.c" | |||
| #include "srot_microk_power10.c" | |||
| #endif | |||
| #endif | |||
| @@ -119,7 +117,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT | |||
| if ( (inc_x == 1) && (inc_y == 1) ) | |||
| { | |||
| #if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) | |||
| #if defined(POWER10) | |||
| if ( n >= 16 ) | |||
| { | |||
| BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 2) & 0x7; | |||
| @@ -38,10 +38,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||
| #if defined(POWER8) || defined(POWER9) | |||
| #include "sscal_microk_power8.c" | |||
| #elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) | |||
| #include "sscal_microk_power10.c" | |||
| #elif defined(POWER10) | |||
| #include "sscal_microk_power8.c" | |||
| #include "sscal_microk_power10.c" | |||
| #endif | |||
| #endif | |||
| @@ -106,7 +104,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS | |||
| if ( da == 0.0 ) | |||
| { | |||
| #if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) | |||
| #if defined(POWER10) | |||
| if ( n >= 32 ) | |||
| { | |||
| BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 2) & 0x7; | |||
| @@ -140,7 +138,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS | |||
| else | |||
| { | |||
| #if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) | |||
| #if defined(POWER10) | |||
| if ( n >= 32 ) | |||
| { | |||
| BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 2) & 0x7; | |||
| @@ -38,10 +38,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||
| #if defined(POWER8) || defined(POWER9) | |||
| #include "sswap_microk_power8.c" | |||
| #elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) | |||
| #include "swap_microk_power10.c" | |||
| #elif defined(POWER10) | |||
| #include "sswap_microk_power8.c" | |||
| #include "swap_microk_power10.c" | |||
| #endif | |||
| #endif | |||
| @@ -119,7 +117,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, | |||
| if ( (inc_x == 1) && (inc_y == 1 )) | |||
| { | |||
| #if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) | |||
| #if defined(POWER10) | |||
| if ( n >= 64 ) | |||
| { | |||
| BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 2) & 0x7; | |||
| @@ -389,7 +389,6 @@ static inline __attribute__ ((always_inline)) void solve16x8(FLOAT *a, FLOAT *b, | |||
| vector FLOAT *Vc6 = (vector FLOAT *) c6; | |||
| vector FLOAT *Vc7 = (vector FLOAT *) c7; | |||
| vector FLOAT VbS0, VbS1, VbS2, VbS3, VbS4, VbS5, VbS6, VbS7; | |||
| int j; | |||
| b[120] = (c0[15] *= a[255]); | |||
| b[121] = (c1[15] *= a[255]); | |||
| @@ -390,7 +390,6 @@ static inline __attribute__ ((always_inline)) void solve16x8(FLOAT *a, FLOAT *b, | |||
| vector FLOAT *Vc6 = (vector FLOAT *) c6; | |||
| vector FLOAT *Vc7 = (vector FLOAT *) c7; | |||
| vector FLOAT VbS0, VbS1, VbS2, VbS3, VbS4, VbS5, VbS6, VbS7; | |||
| int j; | |||
| b[0] = (c0[0] *= a[0]); | |||
| b[1] = (c1[0] *= a[0]); | |||
| @@ -30,9 +30,17 @@ static void zaxpy_kernel_4 (long n, double *x, double *y, | |||
| double alpha_r, double alpha_i) | |||
| { | |||
| #if !defined(CONJ) | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| static const double mvec[2] = { -1.0, 1.0 }; | |||
| #else | |||
| static const double mvec[2] = { 1.0, -1.0 }; | |||
| #endif | |||
| #else | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| static const double mvec[2] = { 1.0, -1.0 }; | |||
| #else | |||
| static const double mvec[2] = { -1.0, 1.0 }; | |||
| #endif | |||
| #endif | |||
| const double *mvecp = mvec; | |||
| @@ -147,13 +147,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| std r0, FLINK_SAVE(SP) | |||
| #if defined(linux) || defined(__FreeBSD__) | |||
| #if defined(linux) || defined(__FreeBSD__) || defined(_AIX) | |||
| ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER) | |||
| #endif | |||
| #ifdef TRMMKERNEL | |||
| #if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) | |||
| #if (defined(linux) || defined(__FreeBSD__) || defined(_AIX)) && defined(__64BIT__) | |||
| ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER) | |||
| #endif | |||
| #endif | |||
| @@ -41,23 +41,38 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #ifndef TRMMKERNEL | |||
| lxv \VS_TEMP1, DISPX(\LOFFSET)(\REG) | |||
| lxv \VS_TEMP2, DISPX(\LOFFSET+16)(\REG) | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| xxmrghd \VS_OUT1,\VS_TEMP1,\VS_TEMP2 | |||
| xxmrgld \VS_OUT2,\VS_TEMP1,\VS_TEMP2 | |||
| #else | |||
| xxmrgld \VS_OUT1,\VS_TEMP1,\VS_TEMP2 | |||
| xxmrghd \VS_OUT2,\VS_TEMP1,\VS_TEMP2 | |||
| #endif | |||
| #endif | |||
| .endm | |||
| /*from 2 result {a0r*br,a0i*bi} and {a1r*br,a1i*bi} pack into {a0r*br,a1r*br} and {a0i*bi,a1i*bi}*/ | |||
| .macro RESULT_INTO_REALREAL_IMAGEIMAGE VSIN1,VSIN2,VSOUT1,VSOUT2 | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| xxmrghd \VSOUT1, \VSIN1,\VSIN2 /* real*real from 2 results*/ | |||
| xxmrgld \VSOUT2, \VSIN1,\VSIN2 /* imag*imag from 2 results*/ | |||
| #else | |||
| xxmrgld \VSOUT1, \VSIN1,\VSIN2 /* real*real from 2 results*/ | |||
| xxmrghd \VSOUT2, \VSIN1,\VSIN2 /* imag*imag from 2 results*/ | |||
| #endif | |||
| .endm | |||
| /*from 2 result {a0r*bi,a0i*br} and {a1r*bi,a1i*br} pack into {a0r*bi,a1r*bi} and {a0i*br,a1i*br}*/ | |||
| .macro RESULT_INTO_REALIMAG_IMAGREAL VSIN1,VSIN2,VSOUT1,VSOUT2 | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| xxmrghd \VSOUT1, \VSIN1,\VSIN2 /* real*imag */ | |||
| xxmrgld \VSOUT2, \VSIN1,\VSIN2 /* imag*real*/ | |||
| #else | |||
| xxmrgld \VSOUT1, \VSIN1,\VSIN2 /* real*imag */ | |||
| xxmrghd \VSOUT2, \VSIN1,\VSIN2 /* imag*real*/ | |||
| #endif | |||
| .endm | |||
| /* {a0r*br op a0i*bi ,a1r*br op a1i*bi} ~ {r0,r1}; {a0r*bi op a0i*br ,a1r*bi op a1i*br} ~ {i0,i1}*/ | |||
| @@ -103,8 +118,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .macro UNPACK_FOR_STORE VSIN1,VSIN2,VSOUT1,VSOUT2 | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| xxmrghd \VSOUT1,\VSIN1,\VSIN2 | |||
| xxmrgld \VSOUT2,\VSIN1,\VSIN2 | |||
| #else | |||
| xxmrghd \VSOUT1,\VSIN2,\VSIN1 | |||
| xxmrgld \VSOUT2,\VSIN2,\VSIN1 | |||
| #endif | |||
| .endm | |||
| @@ -186,15 +206,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes1,vs34,vs35 | |||
| #ifndef TRMMKERNEL | |||
| lxv vs50, (\LOFFSET)(\BASE_REG) | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| xxmrghd vs46,vs50,vs50 | |||
| xxmrgld vs47,vs50,vs50 | |||
| #else | |||
| xxmrgld vs46,vs50,vs50 | |||
| xxmrghd vs47,vs50,vs50 | |||
| #endif | |||
| #endif | |||
| RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes2,vs36,vs37 | |||
| AGGREGATE_REALS_IMAGES vs34,vs35,vs36,vs37 | |||
| MULT_APLHA_PART1 vs34,vs36, vs46,vs47 | |||
| MULT_APLHA_PART2 vs34,vs36, vs46,vs47 | |||
| UNPACK_FOR_STORE vs46,vs47,vs39,vs41 | |||
| #if (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) | |||
| xxmrghd vs39,vs47,vs46 | |||
| #endif | |||
| stxv vs39, (\LOFFSET)(\BASE_REG) | |||
| .endm | |||
| @@ -232,6 +259,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| lxvp vs44, DISP16(\Index,192)(AO) // load real,imag from A | |||
| lxvp vs46, DISP16(\Index,224)(AO) // load real,imag from A | |||
| lxvp vs50, DISP4(\Index, 32)(BO) // load real,imag from B | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| xvf64gerpp 0, vs32, vs48 | |||
| xvf64gerpp 1, vs34, vs48 | |||
| xvf64gerpp 2, vs36, vs48 | |||
| xvf64gerpp 3, vs38, vs48 | |||
| xvf64gerpp 4, vs32, vs49 | |||
| xvf64gerpp 5, vs34, vs49 | |||
| xvf64gerpp 6, vs36, vs49 | |||
| xvf64gerpp 7, vs38, vs49 | |||
| #else | |||
| xvf64gerpp 0, vs32, vs49 | |||
| xvf64gerpp 1, vs34, vs49 | |||
| xvf64gerpp 2, vs36, vs49 | |||
| @@ -240,11 +277,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| xvf64gerpp 5, vs34, vs48 | |||
| xvf64gerpp 6, vs36, vs48 | |||
| xvf64gerpp 7, vs38, vs48 | |||
| #endif | |||
| lxvp vs32, DISP16(\Index, 256)(AO) // load real,imag from A | |||
| lxvp vs34, DISP16(\Index, 288)(AO) // load real,imag from A | |||
| lxvp vs36, DISP16(\Index, 320)(AO) // load real,imag from A | |||
| lxvp vs38, DISP16(\Index, 352)(AO) // load real,imag from A | |||
| lxvp vs48, DISP4(\Index, 64)(BO) // load real imag from B | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| xvf64gerpp 0, vs40, vs50 | |||
| xvf64gerpp 1, vs42, vs50 | |||
| xvf64gerpp 2, vs44, vs50 | |||
| xvf64gerpp 3, vs46, vs50 | |||
| xvf64gerpp 4, vs40, vs51 | |||
| xvf64gerpp 5, vs42, vs51 | |||
| xvf64gerpp 6, vs44, vs51 | |||
| xvf64gerpp 7, vs46, vs51 | |||
| #else | |||
| xvf64gerpp 0, vs40, vs51 | |||
| xvf64gerpp 1, vs42, vs51 | |||
| xvf64gerpp 2, vs44, vs51 | |||
| @@ -253,6 +301,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| xvf64gerpp 5, vs42, vs50 | |||
| xvf64gerpp 6, vs44, vs50 | |||
| xvf64gerpp 7, vs46, vs50 | |||
| #endif | |||
| .if \IsLast==1 | |||
| addi AO, AO, DISP16(\Index,256) | |||
| addi BO, BO, DISP4(\Index,64) | |||
| @@ -261,6 +310,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .macro LOAD_END_2x8 OffsetA,OffsetB | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| xvf64gerpp 0, vs32, vs48 | |||
| xvf64gerpp 1, vs34, vs48 | |||
| xvf64gerpp 2, vs36, vs48 | |||
| xvf64gerpp 3, vs38, vs48 | |||
| xvf64gerpp 4, vs32, vs49 | |||
| xvf64gerpp 5, vs34, vs49 | |||
| xvf64gerpp 6, vs36, vs49 | |||
| xvf64gerpp 7, vs38, vs49 | |||
| #else | |||
| xvf64gerpp 0, vs32, vs49 | |||
| xvf64gerpp 1, vs34, vs49 | |||
| xvf64gerpp 2, vs36, vs49 | |||
| @@ -269,6 +328,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| xvf64gerpp 5, vs34, vs48 | |||
| xvf64gerpp 6, vs36, vs48 | |||
| xvf64gerpp 7, vs38, vs48 | |||
| #endif | |||
| addi BO, BO, \OffsetB | |||
| addi AO, AO, \OffsetA | |||
| .endm | |||
| @@ -305,7 +365,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| xxpermdi vs45, vs12, vs13, 0b10 | |||
| xxpermdi vs46, vs14, vs15, 0b01 | |||
| xxpermdi vs47, vs14, vs15, 0b10 | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| xxlor vs0, vs32, vs32 | |||
| xxlor vs1, vs33, vs33 | |||
| xxlor vs2, vs34, vs34 | |||
| xxlor vs3, vs35, vs35 | |||
| xxlor vs4, vs36, vs36 | |||
| xxlor vs5, vs37, vs37 | |||
| xxlor vs6, vs38, vs38 | |||
| xxlor vs7, vs39, vs39 | |||
| xxlor vs8, vs40, vs40 | |||
| xxlor vs9, vs41, vs41 | |||
| xxlor vs10, vs42, vs42 | |||
| xxlor vs11, vs43, vs43 | |||
| xxlor vs12, vs44, vs44 | |||
| xxlor vs13, vs45, vs45 | |||
| xxlor vs14, vs46, vs46 | |||
| xxlor vs15, vs47, vs47 | |||
| #else | |||
| xxlor vs2, vs32, vs32 | |||
| xxlor vs3, vs33, vs33 | |||
| xxlor vs0, vs34, vs34 | |||
| @@ -322,7 +399,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| xxlor vs15, vs45, vs45 | |||
| xxlor vs12, vs46, vs46 | |||
| xxlor vs13, vs47, vs47 | |||
| #endif | |||
| xxpermdi vs32, vs16, vs17, 0b01 | |||
| xxpermdi vs33, vs16, vs17, 0b10 | |||
| xxpermdi vs34, vs18, vs19, 0b01 | |||
| @@ -339,7 +416,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| xxpermdi vs45, vs28, vs29, 0b10 | |||
| xxpermdi vs46, vs30, vs31, 0b01 | |||
| xxpermdi vs47, vs30, vs31, 0b10 | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| xxlor vs16, vs32, vs32 | |||
| xxlor vs17, vs33, vs33 | |||
| xxlor vs18, vs34, vs34 | |||
| xxlor vs19, vs35, vs35 | |||
| xxlor vs20, vs36, vs36 | |||
| xxlor vs21, vs37, vs37 | |||
| xxlor vs22, vs38, vs38 | |||
| xxlor vs23, vs39, vs39 | |||
| xxlor vs24, vs40, vs40 | |||
| xxlor vs25, vs41, vs41 | |||
| xxlor vs26, vs42, vs42 | |||
| xxlor vs27, vs43, vs43 | |||
| xxlor vs28, vs44, vs44 | |||
| xxlor vs29, vs45, vs45 | |||
| xxlor vs30, vs46, vs46 | |||
| xxlor vs31, vs47, vs47 | |||
| #else | |||
| xxlor vs18, vs32, vs32 | |||
| xxlor vs19, vs33, vs33 | |||
| xxlor vs16, vs34, vs34 | |||
| @@ -356,7 +450,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| xxlor vs31, vs45, vs45 | |||
| xxlor vs28, vs46, vs46 | |||
| xxlor vs29, vs47, vs47 | |||
| #endif | |||
| SAVE8 vs0,vs1,vs2,vs3,vs4,vs5,vs6,vs7,vs8,vs9,vs10,vs11,vs12,vs13,vs14,vs15,CO,0 | |||
| SAVE8 vs16,vs17,vs18,vs19,vs20,vs21,vs22,vs23,vs24,vs25,vs26,vs27,vs28,vs29,vs30,vs31,T1,0 | |||
| addi CO, CO, 128 | |||
| @@ -388,17 +482,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| lxvp vs40, DISP8(\Index, 64)(AO) // load real,imag from A | |||
| lxvp vs42, DISP8(\Index, 96)(AO) // load real,imag from A | |||
| lxvp vs50, DISP4(\Index, 32)(BO) // load real,imag from B | |||
| xvf64gerpp 0, vs32, vs49 | |||
| xvf64gerpp 1, vs34, vs49 | |||
| xvf64gerpp 2, vs32, vs48 | |||
| xvf64gerpp 3, vs34, vs48 | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| xvf64gerpp 0, vs32, vs48 | |||
| xvf64gerpp 1, vs34, vs48 | |||
| xvf64gerpp 2, vs32, vs49 | |||
| xvf64gerpp 3, vs34, vs49 | |||
| #else | |||
| xvf64gerpp 0, vs32, vs49 | |||
| xvf64gerpp 1, vs34, vs49 | |||
| xvf64gerpp 2, vs32, vs48 | |||
| xvf64gerpp 3, vs34, vs48 | |||
| #endif | |||
| lxvp vs32, DISP8(\Index, 128)(AO) // load real,imag from A | |||
| lxvp vs34, DISP8(\Index, 160)(AO) // load real,imag from A | |||
| lxvp vs48, DISP4(\Index, 64)(BO) // load real,imag from B | |||
| xvf64gerpp 0, vs40, vs51 | |||
| xvf64gerpp 1, vs42, vs51 | |||
| xvf64gerpp 2, vs40, vs50 | |||
| xvf64gerpp 3, vs42, vs50 | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| xvf64gerpp 0, vs40, vs50 | |||
| xvf64gerpp 1, vs42, vs50 | |||
| xvf64gerpp 2, vs40, vs51 | |||
| xvf64gerpp 3, vs42, vs51 | |||
| #else | |||
| xvf64gerpp 0, vs40, vs51 | |||
| xvf64gerpp 1, vs42, vs51 | |||
| xvf64gerpp 2, vs40, vs50 | |||
| xvf64gerpp 3, vs42, vs50 | |||
| #endif | |||
| .if \IsLast==1 | |||
| addi AO, AO, DISP8(\Index,128) | |||
| addi BO, BO, DISP4(\Index,64) | |||
| @@ -407,10 +515,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .macro LOAD_END_2x4 OffsetA, OffsetB | |||
| xvf64gerpp 0, vs32, vs49 | |||
| xvf64gerpp 1, vs34, vs49 | |||
| xvf64gerpp 2, vs32, vs48 | |||
| xvf64gerpp 3, vs34, vs48 | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| xvf64gerpp 0, vs32, vs48 | |||
| xvf64gerpp 1, vs34, vs48 | |||
| xvf64gerpp 2, vs32, vs49 | |||
| xvf64gerpp 3, vs34, vs49 | |||
| #else | |||
| xvf64gerpp 0, vs32, vs49 | |||
| xvf64gerpp 1, vs34, vs49 | |||
| xvf64gerpp 2, vs32, vs48 | |||
| xvf64gerpp 3, vs34, vs48 | |||
| #endif | |||
| addi BO, BO, \OffsetB | |||
| addi AO, AO, \OffsetA | |||
| .endm | |||
| @@ -443,7 +558,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| xxpermdi vs45, vs12, vs13, 0b10 | |||
| xxpermdi vs46, vs14, vs15, 0b01 | |||
| xxpermdi vs47, vs14, vs15, 0b10 | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| xxlor vs0, vs32, vs32 | |||
| xxlor vs1, vs33, vs33 | |||
| xxlor vs2, vs34, vs34 | |||
| xxlor vs3, vs35, vs35 | |||
| xxlor vs4, vs36, vs36 | |||
| xxlor vs5, vs37, vs37 | |||
| xxlor vs6, vs38, vs38 | |||
| xxlor vs7, vs39, vs39 | |||
| xxlor vs8, vs40, vs40 | |||
| xxlor vs9, vs41, vs41 | |||
| xxlor vs10, vs42, vs42 | |||
| xxlor vs11, vs43, vs43 | |||
| xxlor vs12, vs44, vs44 | |||
| xxlor vs13, vs45, vs45 | |||
| xxlor vs14, vs46, vs46 | |||
| xxlor vs15, vs47, vs47 | |||
| #else | |||
| xxlor vs2, vs32, vs32 | |||
| xxlor vs3, vs33, vs33 | |||
| xxlor vs0, vs34, vs34 | |||
| @@ -460,7 +592,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| xxlor vs15, vs45, vs45 | |||
| xxlor vs12, vs46, vs46 | |||
| xxlor vs13, vs47, vs47 | |||
| #endif | |||
| SAVE4 vs0,vs1,vs2,vs3,vs4,vs5,vs6,vs7,CO,0 | |||
| SAVE4 vs8,vs9,vs10,vs11,vs12,vs13,vs14,vs15,T1,0 | |||
| addi CO, CO, 64 | |||
| @@ -488,12 +620,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .macro KERNEL2x2_2 Index, IsLast | |||
| lxvp vs40, DISP4(\Index, 32)(AO) // load real,imag from A | |||
| lxvp vs50, DISP4(\Index, 32)(BO) // load real,imag from B | |||
| xvf64gerpp 0, vs32, vs49 | |||
| xvf64gerpp 1, vs32, vs48 | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| xvf64gerpp 0, vs32, vs48 | |||
| xvf64gerpp 1, vs32, vs49 | |||
| #else | |||
| xvf64gerpp 0, vs32, vs49 | |||
| xvf64gerpp 1, vs32, vs48 | |||
| #endif | |||
| lxvp vs32, DISP4(\Index, 64)(AO) // load real,imag from A | |||
| lxvp vs48, DISP4(\Index, 64)(BO) // load real imag from B | |||
| xvf64gerpp 0, vs40, vs51 | |||
| xvf64gerpp 1, vs40, vs50 | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| xvf64gerpp 0, vs40, vs50 | |||
| xvf64gerpp 1, vs40, vs51 | |||
| #else | |||
| xvf64gerpp 0, vs40, vs51 | |||
| xvf64gerpp 1, vs40, vs50 | |||
| #endif | |||
| .if \IsLast==1 | |||
| addi AO, AO, DISP4(\Index,64) | |||
| addi BO, BO, DISP4(\Index,64) | |||
| @@ -502,8 +644,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .macro LOAD_END_2x2 OffsetA,OffsetB | |||
| xvf64gerpp 0, vs32, vs49 | |||
| xvf64gerpp 1, vs32, vs48 | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| xvf64gerpp 0, vs32, vs48 | |||
| xvf64gerpp 1, vs32, vs49 | |||
| #else | |||
| xvf64gerpp 0, vs32, vs49 | |||
| xvf64gerpp 1, vs32, vs48 | |||
| #endif | |||
| addi BO, BO, \OffsetB | |||
| addi AO, AO, \OffsetA | |||
| .endm | |||
| @@ -526,7 +673,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| xxpermdi vs37, vs4, vs5, 0b10 | |||
| xxpermdi vs38, vs6, vs7, 0b01 | |||
| xxpermdi vs39, vs6, vs7, 0b10 | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| xxlor vs0, vs32, vs32 | |||
| xxlor vs1, vs33, vs33 | |||
| xxlor vs2, vs34, vs34 | |||
| xxlor vs3, vs35, vs35 | |||
| xxlor vs4, vs36, vs36 | |||
| xxlor vs5, vs37, vs37 | |||
| xxlor vs6, vs38, vs38 | |||
| xxlor vs7, vs39, vs39 | |||
| #else | |||
| xxlor vs2, vs32, vs32 | |||
| xxlor vs3, vs33, vs33 | |||
| xxlor vs0, vs34, vs34 | |||
| @@ -535,7 +691,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| xxlor vs7, vs37, vs37 | |||
| xxlor vs4, vs38, vs38 | |||
| xxlor vs5, vs39, vs39 | |||
| #endif | |||
| SAVE2 vs0,vs1,vs2,vs3,CO,0 | |||
| SAVE2 vs4,vs5,vs6,vs7,T1,0 | |||
| addi CO, CO, 32 | |||
| @@ -702,14 +858,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| lxvp vs44, DISP16(\Index, 192)(AO) // load real,imag from A | |||
| lxvp vs46, DISP16(\Index, 224)(AO) // load real,imag from A | |||
| lxvp vs48, DISP2(\Index, 0)(BO) // load real imag from B | |||
| xvf64gerpp 0, vs32, vs49 | |||
| xvf64gerpp 1, vs34, vs49 | |||
| xvf64gerpp 2, vs36, vs49 | |||
| xvf64gerpp 3, vs38, vs49 | |||
| xvf64gerpp 0, vs40, vs48 | |||
| xvf64gerpp 1, vs42, vs48 | |||
| xvf64gerpp 2, vs44, vs48 | |||
| xvf64gerpp 3, vs46, vs48 | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| xvf64gerpp 0, vs32, vs48 | |||
| xvf64gerpp 1, vs34, vs48 | |||
| xvf64gerpp 2, vs36, vs48 | |||
| xvf64gerpp 3, vs38, vs48 | |||
| xvf64gerpp 0, vs40, vs49 | |||
| xvf64gerpp 1, vs42, vs49 | |||
| xvf64gerpp 2, vs44, vs49 | |||
| xvf64gerpp 3, vs46, vs49 | |||
| #else | |||
| xvf64gerpp 0, vs32, vs49 | |||
| xvf64gerpp 1, vs34, vs49 | |||
| xvf64gerpp 2, vs36, vs49 | |||
| xvf64gerpp 3, vs38, vs49 | |||
| xvf64gerpp 0, vs40, vs48 | |||
| xvf64gerpp 1, vs42, vs48 | |||
| xvf64gerpp 2, vs44, vs48 | |||
| xvf64gerpp 3, vs46, vs48 | |||
| #endif | |||
| .if \IsLast==1 | |||
| addi AO, AO, DISP16(\Index,256) | |||
| addi BO, BO, DISP2(\Index,32) | |||
| @@ -758,7 +925,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| xxpermdi vs45, vs12, vs13, 0b10 | |||
| xxpermdi vs46, vs14, vs15, 0b01 | |||
| xxpermdi vs47, vs14, vs15, 0b10 | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| xxlor vs0, vs32, vs32 | |||
| xxlor vs1, vs33, vs33 | |||
| xxlor vs2, vs34, vs34 | |||
| xxlor vs3, vs35, vs35 | |||
| xxlor vs4, vs36, vs36 | |||
| xxlor vs5, vs37, vs37 | |||
| xxlor vs6, vs38, vs38 | |||
| xxlor vs7, vs39, vs39 | |||
| xxlor vs8, vs40, vs40 | |||
| xxlor vs9, vs41, vs41 | |||
| xxlor vs10, vs42, vs42 | |||
| xxlor vs11, vs43, vs43 | |||
| xxlor vs12, vs44, vs44 | |||
| xxlor vs13, vs45, vs45 | |||
| xxlor vs14, vs46, vs46 | |||
| xxlor vs15, vs47, vs47 | |||
| #else | |||
| xxlor vs2, vs32, vs32 | |||
| xxlor vs3, vs33, vs33 | |||
| xxlor vs0, vs34, vs34 | |||
| @@ -775,7 +959,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| xxlor vs15, vs45, vs45 | |||
| xxlor vs12, vs46, vs46 | |||
| xxlor vs13, vs47, vs47 | |||
| #endif | |||
| SAVE8 vs0,vs1,vs2,vs3,vs4,vs5,vs6,vs7,vs8,vs9,vs10,vs11,vs12,vs13,vs14,vs15,CO,0 | |||
| addi CO, CO, 128 | |||
| .endm | |||
| @@ -799,10 +983,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| lxvp vs40, DISP8(\Index, 64)(AO) // load real,imag from A | |||
| lxvp vs42, DISP8(\Index, 96)(AO) // load real,imag from A | |||
| lxvp vs48, DISP2(\Index, 0)(BO) // load real imag from B | |||
| xvf64gerpp 0, vs32, vs49 | |||
| xvf64gerpp 1, vs34, vs49 | |||
| xvf64gerpp 0, vs40, vs48 | |||
| xvf64gerpp 1, vs42, vs48 | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| xvf64gerpp 0, vs32, vs48 | |||
| xvf64gerpp 1, vs34, vs48 | |||
| xvf64gerpp 0, vs40, vs49 | |||
| xvf64gerpp 1, vs42, vs49 | |||
| #else | |||
| xvf64gerpp 0, vs32, vs49 | |||
| xvf64gerpp 1, vs34, vs49 | |||
| xvf64gerpp 0, vs40, vs48 | |||
| xvf64gerpp 1, vs42, vs48 | |||
| #endif | |||
| .if \IsLast==1 | |||
| addi AO, AO, DISP8(\Index,128) | |||
| addi BO, BO, DISP2(\Index,32) | |||
| @@ -837,7 +1028,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| xxpermdi vs37, vs4, vs5, 0b10 | |||
| xxpermdi vs38, vs6, vs7, 0b01 | |||
| xxpermdi vs39, vs6, vs7, 0b10 | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| xxlor vs0, vs32, vs32 | |||
| xxlor vs1, vs33, vs33 | |||
| xxlor vs2, vs34, vs34 | |||
| xxlor vs3, vs35, vs35 | |||
| xxlor vs4, vs36, vs36 | |||
| xxlor vs5, vs37, vs37 | |||
| xxlor vs6, vs38, vs38 | |||
| xxlor vs7, vs39, vs39 | |||
| #else | |||
| xxlor vs2, vs32, vs32 | |||
| xxlor vs3, vs33, vs33 | |||
| xxlor vs0, vs34, vs34 | |||
| @@ -846,7 +1046,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| xxlor vs7, vs37, vs37 | |||
| xxlor vs4, vs38, vs38 | |||
| xxlor vs5, vs39, vs39 | |||
| #endif | |||
| SAVE4 vs0,vs1,vs2,vs3,vs4,vs5,vs6,vs7,CO,0 | |||
| addi CO, CO, 64 | |||
| .endm | |||
| @@ -867,8 +1067,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| lxvp vs32, DISP4(\Index, 0)(AO) // load real,imag from A | |||
| lxvp vs40, DISP4(\Index, 32)(AO) // load real,imag from A | |||
| lxvp vs48, DISP2(\Index, 0)(BO) // load real imag from B | |||
| xvf64gerpp 0, vs32, vs49 | |||
| xvf64gerpp 0, vs40, vs48 | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| xvf64gerpp 0, vs32, vs48 | |||
| xvf64gerpp 0, vs40, vs49 | |||
| #else | |||
| xvf64gerpp 0, vs32, vs49 | |||
| xvf64gerpp 0, vs40, vs48 | |||
| #endif | |||
| .if \IsLast==1 | |||
| addi AO, AO, DISP4(\Index,64) | |||
| addi BO, BO, DISP2(\Index,32) | |||
| @@ -896,11 +1101,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| xxpermdi vs33, vs0, vs1, 0b10 | |||
| xxpermdi vs34, vs2, vs3, 0b01 | |||
| xxpermdi vs35, vs2, vs3, 0b10 | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| xxlor vs0, vs32, vs32 | |||
| xxlor vs1, vs33, vs33 | |||
| xxlor vs2, vs34, vs34 | |||
| xxlor vs3, vs35, vs35 | |||
| #else | |||
| xxlor vs2, vs32, vs32 | |||
| xxlor vs3, vs33, vs33 | |||
| xxlor vs0, vs34, vs34 | |||
| xxlor vs1, vs35, vs35 | |||
| #endif | |||
| SAVE2 vs0,vs1,vs2,vs3,CO,0 | |||
| addi CO, CO, 32 | |||
| @@ -607,7 +607,6 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT | |||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT * buffer) { | |||
| BLASLONG i; | |||
| BLASLONG j; | |||
| FLOAT *a_ptr; | |||
| FLOAT *x_ptr; | |||
| FLOAT *y_ptr; | |||
| @@ -738,7 +738,6 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT | |||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT * buffer) { | |||
| BLASLONG i; | |||
| BLASLONG j; | |||
| FLOAT *a_ptr; | |||
| FLOAT *x_ptr; | |||
| FLOAT *y_ptr; | |||
| @@ -43,7 +43,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #elif HAVE_KERNEL_4x4_VEC | |||
| #if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) | |||
| #if defined(POWER10) | |||
| typedef __vector unsigned char vec_t; | |||
| typedef FLOAT v4sf_t __attribute__ ((vector_size (16))); | |||
| @@ -43,16 +43,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #if defined(DOUBLE) | |||
| #include "zscal_microk_power8.c" | |||
| #endif | |||
| #elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) | |||
| #elif defined(POWER10) | |||
| #if defined(DOUBLE) | |||
| #include "zscal_microk_power10.c" | |||
| #else | |||
| #include "cscal_microk_power10.c" | |||
| #endif | |||
| #elif defined(POWER10) | |||
| #if defined(DOUBLE) | |||
| #include "zscal_microk_power8.c" | |||
| #endif | |||
| #endif | |||
| #endif | |||
| @@ -42,7 +42,11 @@ static void zscal_kernel_8 (long n, double *x, double alpha_r, double alpha_i) | |||
| "xsnegdp 33, %x10 \n\t" // -alpha_i | |||
| XXSPLTD_S(32,%x9,0) // alpha_r , alpha_r | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| XXMRGHD_S(33,33, %x10) // -alpha_i , alpha_i | |||
| #else | |||
| XXMRGHD_S(33,%x10, 33) // -alpha_i , alpha_i | |||
| #endif | |||
| "lxvp 40, 0(%2) \n\t" | |||
| "lxvp 42, 32(%2) \n\t" | |||
| @@ -97,10 +101,17 @@ static void zscal_kernel_8 (long n, double *x, double alpha_r, double alpha_i) | |||
| "xvadddp 49, 49, 39 \n\t" | |||
| "xvadddp 50, 50, %x3 \n\t" | |||
| "xvadddp 51, 51, %x4 \n\t" | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| "stxv 48, 0(%2) \n\t" | |||
| "stxv 49, 16(%2) \n\t" | |||
| "stxv 50, 32(%2) \n\t" | |||
| "stxv 51, 48(%2) \n\t" | |||
| #else | |||
| "stxv 49, 0(%2) \n\t" | |||
| "stxv 48, 16(%2) \n\t" | |||
| "stxv 51, 32(%2) \n\t" | |||
| "stxv 50, 48(%2) \n\t" | |||
| #endif | |||
| "xvadddp 34, 34, %x5 \n\t" | |||
| @@ -109,12 +120,17 @@ static void zscal_kernel_8 (long n, double *x, double alpha_r, double alpha_i) | |||
| "xvadddp 36, 36, %x7 \n\t" | |||
| "xvadddp 37, 37, %x8 \n\t" | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| "stxv 34, 64(%2) \n\t" | |||
| "stxv 35, 80(%2) \n\t" | |||
| "stxv 36, 96(%2) \n\t" | |||
| "stxv 37, 112(%2) \n\t" | |||
| #else | |||
| "stxv 35, 64(%2) \n\t" | |||
| "stxv 34, 80(%2) \n\t" | |||
| "stxv 37, 96(%2) \n\t" | |||
| "stxv 36, 112(%2) \n\t" | |||
| #endif | |||
| "addi %2, %2, 128 \n\t" | |||
| "addic. %1, %1, -8 \n\t" | |||
| @@ -155,23 +171,34 @@ static void zscal_kernel_8 (long n, double *x, double alpha_r, double alpha_i) | |||
| "xvadddp 50, 50, %x3 \n\t" | |||
| "xvadddp 51, 51, %x4 \n\t" | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| "stxv 48, 0(%2) \n\t" | |||
| "stxv 49, 16(%2) \n\t" | |||
| "stxv 50, 32(%2) \n\t" | |||
| "stxv 51, 48(%2) \n\t" | |||
| #else | |||
| "stxv 49, 0(%2) \n\t" | |||
| "stxv 48, 16(%2) \n\t" | |||
| "stxv 51, 32(%2) \n\t" | |||
| "stxv 50, 48(%2) \n\t" | |||
| #endif | |||
| "xvadddp 34, 34, %x5 \n\t" | |||
| "xvadddp 35, 35, %x6 \n\t" | |||
| "xvadddp 36, 36, %x7 \n\t" | |||
| "xvadddp 37, 37, %x8 \n\t" | |||
| #if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| "stxv 34, 64(%2) \n\t" | |||
| "stxv 35, 80(%2) \n\t" | |||
| "stxv 36, 96(%2) \n\t" | |||
| "stxv 37, 112(%2) \n\t" | |||
| #else | |||
| "stxv 35, 64(%2) \n\t" | |||
| "stxv 34, 80(%2) \n\t" | |||
| "stxv 37, 96(%2) \n\t" | |||
| "stxv 36, 112(%2) \n\t" | |||
| #endif | |||
| "#n=%1 x=%0=%2 alpha=(%9,%10) \n" | |||
| : | |||
| "+m" (*x), | |||
| @@ -39,10 +39,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||
| #if defined(POWER8) || defined(POWER9) | |||
| #include "zswap_microk_power8.c" | |||
| #elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) | |||
| #elif defined(POWER10) | |||
| #include "cswap_microk_power10.c" | |||
| #elif defined(POWER10) | |||
| #include "zswap_microk_power8.c" | |||
| #endif | |||
| #endif | |||
| @@ -9,3 +9,14 @@ SBGEMM_SMALL_K_TN = sbgemm_small_kernel_tn_cooperlake.c | |||
| SBGEMM_SMALL_K_B0_TN = sbgemm_small_kernel_tn_cooperlake.c | |||
| SBGEMM_SMALL_K_TT = sbgemm_small_kernel_tt_cooperlake.c | |||
| SBGEMM_SMALL_K_B0_TT = sbgemm_small_kernel_tt_cooperlake.c | |||
| SBGEMM_BETA = sgemm_beta_skylakex.c | |||
| SBGEMMKERNEL = sbgemm_kernel_16x4_cooperlake.c | |||
| SBGEMMINCOPY = sbgemm_ncopy_16_cooperlake.c | |||
| SBGEMMITCOPY = sbgemm_tcopy_16_cooperlake.c | |||
| SBGEMMONCOPY = sbgemm_ncopy_4_cooperlake.c | |||
| SBGEMMOTCOPY = sbgemm_tcopy_4_cooperlake.c | |||
| SBGEMMINCOPYOBJ = sbgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| SBGEMMITCOPYOBJ = sbgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| SBGEMMONCOPYOBJ = sbgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| SBGEMMOTCOPYOBJ = sbgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| @@ -56,25 +56,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define BF16_MATRIX_LOAD_8x16(regArray, a, lda, idx_m, idx_n) \ | |||
| regArray##_0 = _mm256_loadu_si256(&a[(idx_m+0)*lda + idx_n]); \ | |||
| regArray##_1 = _mm256_loadu_si256(&a[(idx_m+1)*lda + idx_n]); \ | |||
| regArray##_2 = _mm256_loadu_si256(&a[(idx_m+2)*lda + idx_n]); \ | |||
| regArray##_3 = _mm256_loadu_si256(&a[(idx_m+3)*lda + idx_n]); \ | |||
| regArray##_4 = _mm256_loadu_si256(&a[(idx_m+4)*lda + idx_n]); \ | |||
| regArray##_5 = _mm256_loadu_si256(&a[(idx_m+5)*lda + idx_n]); \ | |||
| regArray##_6 = _mm256_loadu_si256(&a[(idx_m+6)*lda + idx_n]); \ | |||
| regArray##_7 = _mm256_loadu_si256(&a[(idx_m+7)*lda + idx_n]); | |||
| regArray##_0 = _mm256_loadu_si256((__m256i *)(&a[(idx_m+0)*lda + idx_n])); \ | |||
| regArray##_1 = _mm256_loadu_si256((__m256i *)(&a[(idx_m+1)*lda + idx_n])); \ | |||
| regArray##_2 = _mm256_loadu_si256((__m256i *)(&a[(idx_m+2)*lda + idx_n])); \ | |||
| regArray##_3 = _mm256_loadu_si256((__m256i *)(&a[(idx_m+3)*lda + idx_n])); \ | |||
| regArray##_4 = _mm256_loadu_si256((__m256i *)(&a[(idx_m+4)*lda + idx_n])); \ | |||
| regArray##_5 = _mm256_loadu_si256((__m256i *)(&a[(idx_m+5)*lda + idx_n])); \ | |||
| regArray##_6 = _mm256_loadu_si256((__m256i *)(&a[(idx_m+6)*lda + idx_n])); \ | |||
| regArray##_7 = _mm256_loadu_si256((__m256i *)(&a[(idx_m+7)*lda + idx_n])); | |||
| #define BF16_MATRIX_LOAD_8x8(regArray, a, lda, idx_m, idx_n) \ | |||
| regArray##_0 = _mm_loadu_si128(&a[(idx_m+0)*lda + idx_n]); \ | |||
| regArray##_1 = _mm_loadu_si128(&a[(idx_m+1)*lda + idx_n]); \ | |||
| regArray##_2 = _mm_loadu_si128(&a[(idx_m+2)*lda + idx_n]); \ | |||
| regArray##_3 = _mm_loadu_si128(&a[(idx_m+3)*lda + idx_n]); \ | |||
| regArray##_4 = _mm_loadu_si128(&a[(idx_m+4)*lda + idx_n]); \ | |||
| regArray##_5 = _mm_loadu_si128(&a[(idx_m+5)*lda + idx_n]); \ | |||
| regArray##_6 = _mm_loadu_si128(&a[(idx_m+6)*lda + idx_n]); \ | |||
| regArray##_7 = _mm_loadu_si128(&a[(idx_m+7)*lda + idx_n]); | |||
| regArray##_0 = _mm_loadu_si128((__m128i *)(&a[(idx_m+0)*lda + idx_n])); \ | |||
| regArray##_1 = _mm_loadu_si128((__m128i *)(&a[(idx_m+1)*lda + idx_n])); \ | |||
| regArray##_2 = _mm_loadu_si128((__m128i *)(&a[(idx_m+2)*lda + idx_n])); \ | |||
| regArray##_3 = _mm_loadu_si128((__m128i *)(&a[(idx_m+3)*lda + idx_n])); \ | |||
| regArray##_4 = _mm_loadu_si128((__m128i *)(&a[(idx_m+4)*lda + idx_n])); \ | |||
| regArray##_5 = _mm_loadu_si128((__m128i *)(&a[(idx_m+5)*lda + idx_n])); \ | |||
| regArray##_6 = _mm_loadu_si128((__m128i *)(&a[(idx_m+6)*lda + idx_n])); \ | |||
| regArray##_7 = _mm_loadu_si128((__m128i *)(&a[(idx_m+7)*lda + idx_n])); | |||
| #define BF16_MATRIX_LOAD_1x32(regArray, a, lda, idx_m, idx_n) \ | |||
| @@ -153,11 +153,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define BF16_VECTOR_LOAD_1x16(reg, x, idx_n) \ | |||
| reg = _mm256_loadu_si256(x + idx_n); | |||
| reg = _mm256_loadu_si256((__m256i *)(x + idx_n)); | |||
| #define BF16_VECTOR_LOAD_1x8(reg, x, idx_n) \ | |||
| reg = _mm_loadu_si128(x + idx_n); | |||
| reg = _mm_loadu_si128((__m128i *)(x + idx_n)); | |||
| #define BF16_VECTOR_MASKZ_LOAD_1x32(reg, x, idx_n, mask) \ | |||
| @@ -15,7 +15,7 @@ static FLOAT casum_kernel(BLASLONG n, FLOAT *x) | |||
| if (n2 < 64) { | |||
| __m128 accum_10, accum_11, accum_12, accum_13; | |||
| __m128 abs_mask1; | |||
| __m128 abs_mask1 = abs_mask1; | |||
| accum_10 = _mm_setzero_ps(); | |||
| accum_11 = _mm_setzero_ps(); | |||
| @@ -38,10 +38,10 @@ static FLOAT dasum_kernel(BLASLONG n, FLOAT *x1) | |||
| __m256i abs_mask = _mm256_set1_epi64x(0x7fffffffffffffff); | |||
| for (i = 0; i < tail_index_AVX2; i += 16) { | |||
| accum_0 += (__m256d)_mm256_and_si256(_mm256_load_si256(&x1[i+ 0]), abs_mask); | |||
| accum_1 += (__m256d)_mm256_and_si256(_mm256_load_si256(&x1[i+ 4]), abs_mask); | |||
| accum_2 += (__m256d)_mm256_and_si256(_mm256_load_si256(&x1[i+ 8]), abs_mask); | |||
| accum_3 += (__m256d)_mm256_and_si256(_mm256_load_si256(&x1[i+12]), abs_mask); | |||
| accum_0 += (__m256d)_mm256_and_si256(_mm256_load_si256((__m256i*)&x1[i+ 0]), abs_mask); | |||
| accum_1 += (__m256d)_mm256_and_si256(_mm256_load_si256((__m256i*)&x1[i+ 4]), abs_mask); | |||
| accum_2 += (__m256d)_mm256_and_si256(_mm256_load_si256((__m256i*)&x1[i+ 8]), abs_mask); | |||
| accum_3 += (__m256d)_mm256_and_si256(_mm256_load_si256((__m256i*)&x1[i+12]), abs_mask); | |||
| } | |||
| accum_0 = accum_0 + accum_1 + accum_2 + accum_3; | |||
| @@ -63,10 +63,10 @@ static FLOAT dasum_kernel(BLASLONG n, FLOAT *x1) | |||
| __m128i abs_mask2 = _mm_set1_epi64x(0x7fffffffffffffff); | |||
| for (i = tail_index_AVX2; i < tail_index_SSE; i += 8) { | |||
| accum_20 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 0]), abs_mask2); | |||
| accum_21 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 2]), abs_mask2); | |||
| accum_22 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 4]), abs_mask2); | |||
| accum_23 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 6]), abs_mask2); | |||
| accum_20 += (__m128d)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 0]), abs_mask2); | |||
| accum_21 += (__m128d)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 2]), abs_mask2); | |||
| accum_22 += (__m128d)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 4]), abs_mask2); | |||
| accum_23 += (__m128d)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 6]), abs_mask2); | |||
| } | |||
| accum_20 = accum_20 + accum_21 + accum_22 + accum_23; | |||
| @@ -58,10 +58,10 @@ static FLOAT dasum_kernel(BLASLONG n, FLOAT *x1) | |||
| __m128i abs_mask2 = _mm_set1_epi64x(0x7fffffffffffffff); | |||
| for (i = tail_index_AVX512; i < tail_index_SSE; i += 8) { | |||
| accum_20 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 0]), abs_mask2); | |||
| accum_21 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 2]), abs_mask2); | |||
| accum_22 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 4]), abs_mask2); | |||
| accum_23 += (__m128d)_mm_and_si128(_mm_loadu_si128(&x1[i + 6]), abs_mask2); | |||
| accum_20 += (__m128d)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 0]), abs_mask2); | |||
| accum_21 += (__m128d)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 2]), abs_mask2); | |||
| accum_22 += (__m128d)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 4]), abs_mask2); | |||
| accum_23 += (__m128d)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 6]), abs_mask2); | |||
| } | |||
| accum_20 = accum_20 + accum_21 + accum_22 + accum_23; | |||
| @@ -38,10 +38,10 @@ static FLOAT sasum_kernel(BLASLONG n, FLOAT *x1) | |||
| __m256i abs_mask = _mm256_set1_epi32(0x7fffffff); | |||
| for (i = 0; i < tail_index_AVX2; i += 32) { | |||
| accum_0 += (__m256)_mm256_and_si256(_mm256_load_si256(&x1[i+ 0]), abs_mask); | |||
| accum_1 += (__m256)_mm256_and_si256(_mm256_load_si256(&x1[i+ 8]), abs_mask); | |||
| accum_2 += (__m256)_mm256_and_si256(_mm256_load_si256(&x1[i+16]), abs_mask); | |||
| accum_3 += (__m256)_mm256_and_si256(_mm256_load_si256(&x1[i+24]), abs_mask); | |||
| accum_0 += (__m256)_mm256_and_si256(_mm256_load_si256((__m256i*)&x1[i+ 0]), abs_mask); | |||
| accum_1 += (__m256)_mm256_and_si256(_mm256_load_si256((__m256i*)&x1[i+ 8]), abs_mask); | |||
| accum_2 += (__m256)_mm256_and_si256(_mm256_load_si256((__m256i*)&x1[i+16]), abs_mask); | |||
| accum_3 += (__m256)_mm256_and_si256(_mm256_load_si256((__m256i*)&x1[i+24]), abs_mask); | |||
| } | |||
| accum_0 = accum_0 + accum_1 + accum_2 + accum_3; | |||
| @@ -62,8 +62,8 @@ static FLOAT sasum_kernel(BLASLONG n, FLOAT *x1) | |||
| __m128i abs_mask2 = _mm_set1_epi32(0x7fffffff); | |||
| for (i = tail_index_AVX2; i < tail_index_SSE; i += 8) { | |||
| accum_20 += (__m128)_mm_and_si128(_mm_loadu_si128(&x1[i + 0]), abs_mask2); | |||
| accum_21 += (__m128)_mm_and_si128(_mm_loadu_si128(&x1[i + 4]), abs_mask2); | |||
| accum_20 += (__m128)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 0]), abs_mask2); | |||
| accum_21 += (__m128)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 4]), abs_mask2); | |||
| } | |||
| accum_20 += accum_21; | |||
| @@ -53,8 +53,8 @@ static FLOAT sasum_kernel(BLASLONG n, FLOAT *x1) | |||
| __m128i abs_mask2 = _mm_set1_epi32(0x7fffffff); | |||
| for (i = tail_index_AVX512; i < tail_index_SSE; i += 8) { | |||
| accum_20 += (__m128)_mm_and_si128(_mm_loadu_si128(&x1[i + 0]), abs_mask2); | |||
| accum_21 += (__m128)_mm_and_si128(_mm_loadu_si128(&x1[i + 4]), abs_mask2); | |||
| accum_20 += (__m128)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 0]), abs_mask2); | |||
| accum_21 += (__m128)_mm_and_si128(_mm_loadu_si128((__m128i*)&x1[i + 4]), abs_mask2); | |||
| } | |||
| accum_20 += accum_21; | |||
| @@ -79,21 +79,21 @@ static float sbdot_accl_kernel(BLASLONG n, bfloat16 *x, bfloat16 *y) | |||
| __m256 accum256_1 = _mm256_setzero_ps(); | |||
| int tail_index_32 = n&(~31); | |||
| for (int j = 0; j < tail_index_32; j += 32) { | |||
| accum256 = _mm256_dpbf16_ps(accum256, (__m256bh) _mm256_loadu_si256(&x[j+ 0]), (__m256bh) _mm256_loadu_si256(&y[j+ 0])); | |||
| accum256_1 = _mm256_dpbf16_ps(accum256_1, (__m256bh) _mm256_loadu_si256(&x[j+16]), (__m256bh) _mm256_loadu_si256(&y[j+16])); | |||
| accum256 = _mm256_dpbf16_ps(accum256, (__m256bh) _mm256_loadu_si256((__m256i *)&x[j+ 0]), (__m256bh) _mm256_loadu_si256((__m256i *)&y[j+ 0])); | |||
| accum256_1 = _mm256_dpbf16_ps(accum256_1, (__m256bh) _mm256_loadu_si256((__m256i *)&x[j+16]), (__m256bh) _mm256_loadu_si256((__m256i *)&y[j+16])); | |||
| } | |||
| accum256 = _mm256_add_ps(accum256, accum256_1); | |||
| /* Processing the remaining <32 chunk with 16-elements processing */ | |||
| if ((n&16) != 0) { | |||
| accum256 = _mm256_dpbf16_ps(accum256, (__m256bh) _mm256_loadu_si256(&x[tail_index_32]), (__m256bh) _mm256_loadu_si256(&y[tail_index_32])); | |||
| accum256 = _mm256_dpbf16_ps(accum256, (__m256bh) _mm256_loadu_si256((__m256i *)&x[tail_index_32]), (__m256bh) _mm256_loadu_si256((__m256i *)&y[tail_index_32])); | |||
| } | |||
| accum128 = _mm_add_ps(_mm256_castps256_ps128(accum256), _mm256_extractf128_ps(accum256, 1)); | |||
| /* Processing the remaining <16 chunk with 8-elements processing */ | |||
| if ((n&8) != 0) { | |||
| int tail_index_16 = n&(~15); | |||
| accum128 = _mm_dpbf16_ps(accum128, (__m128bh) _mm_loadu_si128(&x[tail_index_16]), (__m128bh) _mm_loadu_si128(&y[tail_index_16])); | |||
| accum128 = _mm_dpbf16_ps(accum128, (__m128bh) _mm_loadu_si128((__m128i *)&x[tail_index_16]), (__m128bh) _mm_loadu_si128((__m128i *)&y[tail_index_16])); | |||
| } | |||
| /* Processing the remaining <8 chunk with masked 8-elements processing */ | |||
| @@ -108,13 +108,13 @@ static float sbdot_accl_kernel(BLASLONG n, bfloat16 *x, bfloat16 *y) | |||
| } else if (n > 15) { /* n range from 16 to 31 */ | |||
| /* Processing <32 chunk with 16-elements processing */ | |||
| __m256 accum256 = _mm256_setzero_ps(); | |||
| accum256 = _mm256_dpbf16_ps(accum256, (__m256bh) _mm256_loadu_si256(&x[0]), (__m256bh) _mm256_loadu_si256(&y[0])); | |||
| accum256 = _mm256_dpbf16_ps(accum256, (__m256bh) _mm256_loadu_si256((__m256i *)&x[0]), (__m256bh) _mm256_loadu_si256((__m256i *)&y[0])); | |||
| accum128 += _mm_add_ps(_mm256_castps256_ps128(accum256), _mm256_extractf128_ps(accum256, 1)); | |||
| /* Processing the remaining <16 chunk with 8-elements processing */ | |||
| if ((n&8) != 0) { | |||
| int tail_index_16 = n&(~15); | |||
| accum128 = _mm_dpbf16_ps(accum128, (__m128bh) _mm_loadu_si128(&x[tail_index_16]), (__m128bh) _mm_loadu_si128(&y[tail_index_16])); | |||
| accum128 = _mm_dpbf16_ps(accum128, (__m128bh) _mm_loadu_si128((__m128i *)&x[tail_index_16]), (__m128bh) _mm_loadu_si128((__m128i *)&y[tail_index_16])); | |||
| } | |||
| /* Processing the remaining <8 chunk with masked 8-elements processing */ | |||
| @@ -128,7 +128,7 @@ static float sbdot_accl_kernel(BLASLONG n, bfloat16 *x, bfloat16 *y) | |||
| } | |||
| } else if (n > 7) { /* n range from 8 to 15 */ | |||
| /* Processing <16 chunk with 8-elements processing */ | |||
| accum128 = _mm_dpbf16_ps(accum128, (__m128bh) _mm_loadu_si128(&x[0]), (__m128bh) _mm_loadu_si128(&y[0])); | |||
| accum128 = _mm_dpbf16_ps(accum128, (__m128bh) _mm_loadu_si128((__m128i *)&x[0]), (__m128bh) _mm_loadu_si128((__m128i *)&y[0])); | |||
| /* Processing the remaining <8 chunk with masked 8-elements processing */ | |||
| if ((n&7) != 0) { | |||
| @@ -1246,7 +1246,7 @@ void COL_MAJOR_ITCOPY_KERNEL_Kx16(BLASLONG k, bfloat16 * A, BLASLONG lda, bfloat | |||
| // K=Any number but will be processed based on 32, M<=16 | |||
| void COL_MAJOR_ITCOPY_KERNEL_Kx16m(BLASLONG m, BLASLONG k, bfloat16 * A, BLASLONG lda, bfloat16 * block_A) | |||
| { | |||
| bfloat16 * src_addr0, * src_addr1, * src_addr2, * src_addr3; | |||
| bfloat16 * src_addr0; | |||
| bfloat16 * dst_addr0, * dst_addr1; | |||
| BLASLONG tag_k_32x = k & (~31); | |||
| @@ -0,0 +1,499 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2021, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include <immintrin.h> | |||
| #include "common.h" | |||
| #define VMOVLDUP(addr, zmm) asm("vmovsldup (%1), %0": "=v"(zmm): "r"(addr)) | |||
| #define VMOVHDUP(addr, zmm) asm("vmovshdup (%1), %0": "=v"(zmm): "r"(addr)) | |||
| #define BROADCAST64(base, step, n, offset, zmm) \ | |||
| if (n == 0) asm("vbroadcastsd %c2(%1), %0": "=v"(zmm): "r"(base), "n"(offset*2)); \ | |||
| else asm("vbroadcastsd %c4(%1, %2, %c3), %0": "=v"(zmm): "r"(base), "r"(step), "n"(n*2), "n"(offset*2)) | |||
| #define DECLARE_A_PAIR(A) \ | |||
| __m512i A_lo_##A; __m512i A_hi_##A; | |||
| #define LOAD_A_PAIR(A) \ | |||
| VMOVLDUP(ptr_a##A, A_lo_##A); \ | |||
| VMOVHDUP(ptr_a##A, A_hi_##A); | |||
| #define MASK_LOAD_A_PAIR(A) { \ | |||
| __m512 tmp = _mm512_maskz_loadu_ps(mmask, ptr_a##A); \ | |||
| A_lo_##A = (__m512i) _mm512_moveldup_ps(tmp); \ | |||
| A_hi_##A = (__m512i) _mm512_movehdup_ps(tmp); \ | |||
| } | |||
| #define LOAD_A_PAIR_TAIL(A) { \ | |||
| __m256i ymm = _mm256_loadu_si256((void *)ptr_a##A); \ | |||
| __m512 zmm = (__m512) _mm512_cvtepu16_epi32(ymm); \ | |||
| A_lo_##A = (__m512i) _mm512_moveldup_ps(zmm); \ | |||
| A_hi_##A = (__m512i) _mm512_movehdup_ps(zmm); \ | |||
| } | |||
| #define MASK_LOAD_A_PAIR_TAIL(A) { \ | |||
| __m256i ymm = _mm256_maskz_loadu_epi16(mmask, ptr_a##A); \ | |||
| __m512 zmm = (__m512) _mm512_cvtepu16_epi32(ymm); \ | |||
| A_lo_##A = (__m512i) _mm512_moveldup_ps(zmm); \ | |||
| A_hi_##A = (__m512i) _mm512_movehdup_ps(zmm); \ | |||
| } | |||
| #define DECLARE_B_PAIR() \ | |||
| __m512i B_lo; __m512i B_hi; | |||
| #define PREFETCH_B_STEP 32 | |||
| #define PREFETCH_B(Bx, By) \ | |||
| if (By == 0) asm("prefetcht0 %c1(%0)": : "r"(ptr_b##Bx), "n"(PREFETCH_B_STEP * 2)); \ | |||
| else asm("prefetcht0 %c3(%0, %1, %c2)": : "r"(ptr_b##Bx), "r"(n_blksize), "n"(By*2), "n"(PREFETCH_B_STEP * 2)) | |||
| #define BROADCAST_B_PAIR(Bx, By) \ | |||
| BROADCAST64(ptr_b##Bx, n_blksize, By, 0, B_lo); \ | |||
| BROADCAST64(ptr_b##Bx, n_blksize, By, 4, B_hi); | |||
| #define MASK_BROADCAST_B_PAIR(Bx, x) {\ | |||
| __m128 xmm = _mm_maskz_loadu_ps(nmask, ptr_b##Bx); \ | |||
| B_lo = (__m512i) _mm512_broadcastsd_pd((__m128d) xmm); \ | |||
| B_hi = (__m512i) _mm512_broadcastsd_pd(_mm_permute_pd((__m128d) xmm, 0x1)); \ | |||
| } | |||
| #define BROADCAST_B_PAIR_TAIL(Bx, By) {\ | |||
| __m128i xmm = (__m128i) _mm_load_sd((double *)(ptr_b##Bx + n_blksize * By)); \ | |||
| xmm = _mm_cvtepu16_epi32(xmm); \ | |||
| B_lo = _mm512_broadcast_i32x2(xmm); \ | |||
| B_hi = _mm512_broadcast_i32x2((__m128i) _mm_permute_pd((__m128d) xmm, 0x1)); \ | |||
| } | |||
| #define MASK_BROADCAST_B_PAIR_TAIL(Bx, By) {\ | |||
| __m128i xmm = _mm_maskz_loadu_epi16(nmask, ptr_b##Bx + n_blksize * By); \ | |||
| xmm = _mm_cvtepu16_epi32(xmm); \ | |||
| B_lo = _mm512_broadcast_i32x2(xmm); \ | |||
| B_hi = _mm512_broadcast_i32x2((__m128i) _mm_permute_pd((__m128d) xmm, 0x1)); \ | |||
| } | |||
| #define DECLARE_RESULT_4X(A, Bx, By) \ | |||
| __m512 result_00_##A##Bx##By = _mm512_setzero_ps(); \ | |||
| __m512 result_01_##A##Bx##By = _mm512_setzero_ps(); \ | |||
| __m512 result_10_##A##Bx##By = _mm512_setzero_ps(); \ | |||
| __m512 result_11_##A##Bx##By = _mm512_setzero_ps(); | |||
| #define FMA(a, b, r) r = _mm512_dpbf16_ps(r, (__m512bh)a, (__m512bh)b) | |||
| #define MATMUL_4X(A, Bx, By) \ | |||
| FMA(A_lo_##A, B_lo, result_00_##A##Bx##By); \ | |||
| FMA(A_hi_##A, B_lo, result_01_##A##Bx##By); \ | |||
| FMA(A_lo_##A, B_hi, result_10_##A##Bx##By); \ | |||
| FMA(A_hi_##A, B_hi, result_11_##A##Bx##By); | |||
| #define _STORE_C_2nx16(addr, val0, val1) \ | |||
| asm("vfmadd213ps (%1), %2, %0": "+v"(val0) : "r"(addr), "v"(alpha_512)); \ | |||
| asm("vfmadd213ps (%1, %3, 4), %2, %0": "+v"(val1) : "r"(addr), "v"(alpha_512), "r"(ldc)); \ | |||
| asm("vmovups %0, (%1)": : "v"(val0), "r"(addr)); \ | |||
| asm("vmovups %0, (%1, %2, 4)": : "v"(val1), "r"(addr), "r"(ldc)) | |||
| #define _MASK_STORE_C_2nx16(addr, val0, val1) \ | |||
| asm("vfmadd213ps (%1), %2, %0 %{%3%} ": "+v"(val0) : "r"(addr), "v"(alpha_512), "k"(mmask)); \ | |||
| asm("vfmadd213ps (%1, %3, 4), %2, %0 %{%4%}": "+v"(val1) : "r"(addr), "v"(alpha_512), "r"(ldc), "k"(mmask)); \ | |||
| asm("vmovups %0, (%1) %{%2%}": : "v"(val0), "r"(addr), "k"(mmask)); \ | |||
| asm("vmovups %0, (%1, %2, 4) %{%3%}": : "v"(val1), "r"(addr), "r"(ldc), "k"(mmask)) | |||
| #define _REORDER_C_2X(result_0, result_1) { \ | |||
| __m512 tmp0, tmp1; \ | |||
| tmp0 = _mm512_unpacklo_ps(result_0, result_1); \ | |||
| tmp1 = _mm512_unpackhi_ps(result_0, result_1); \ | |||
| result_0 = (__m512) _mm512_unpacklo_pd((__m512d) tmp0, (__m512d) tmp1); \ | |||
| result_1 = (__m512) _mm512_unpackhi_pd((__m512d) tmp0, (__m512d) tmp1); \ | |||
| } | |||
| #define _STORE_2X(ptr_c, result_0, result_1) {\ | |||
| _REORDER_C_2X(result_0, result_1) \ | |||
| _STORE_C_2nx16(ptr_c, result_0, result_1); \ | |||
| ptr_c += ldc * 2; \ | |||
| } | |||
| #define _MASK_STORE_2X(ptr_c, result_0, result_1) {\ | |||
| _REORDER_C_2X(result_0, result_1) \ | |||
| _MASK_STORE_C_2nx16(ptr_c, result_0, result_1); \ | |||
| ptr_c += ldc * 2; \ | |||
| } | |||
| #define STORE_4X(A, Bx, By) { \ | |||
| _STORE_2X(ptr_c##A, result_00_##A##Bx##By, result_01_##A##Bx##By); \ | |||
| _STORE_2X(ptr_c##A, result_10_##A##Bx##By, result_11_##A##Bx##By); \ | |||
| } | |||
| #define MASK_STORE_4X(A, Bx, By) { \ | |||
| _MASK_STORE_2X(ptr_c##A, result_00_##A##Bx##By, result_01_##A##Bx##By); \ | |||
| _MASK_STORE_2X(ptr_c##A, result_10_##A##Bx##By, result_11_##A##Bx##By); \ | |||
| } | |||
| #define _STORE_C_16(addr, val0) \ | |||
| asm("vfmadd213ps (%1), %2, %0": "+v"(val0) : "r"(addr), "v"(alpha_512)); \ | |||
| asm("vmovups %0, (%1)": : "v"(val0), "r"(addr)); | |||
| #define _MASK_STORE_C_16(addr, val0) \ | |||
| asm("vfmadd213ps (%1), %2, %0 %{%3%} ": "+v"(val0) : "r"(addr), "v"(alpha_512), "k"(mmask)); \ | |||
| asm("vmovups %0, (%1) %{%2%}": : "v"(val0), "r"(addr), "k"(mmask)); | |||
| #define N_STORE_4X(A, Bx, By) { \ | |||
| _REORDER_C_2X(result_00_##A##Bx##By, result_01_##A##Bx##By); \ | |||
| _REORDER_C_2X(result_10_##A##Bx##By, result_11_##A##Bx##By); \ | |||
| switch(n_count) { \ | |||
| case 3: _STORE_C_16(ptr_c + ldc * 2, result_10_##A##Bx##By); \ | |||
| case 2: _STORE_C_16(ptr_c + ldc * 1, result_01_##A##Bx##By); \ | |||
| case 1: _STORE_C_16(ptr_c + ldc * 0, result_00_##A##Bx##By); \ | |||
| } \ | |||
| ptr_c##A += ldc * n_count; \ | |||
| } | |||
| #define N_MASK_STORE_4X(A, Bx, By) { \ | |||
| _REORDER_C_2X(result_00_##A##Bx##By, result_01_##A##Bx##By); \ | |||
| _REORDER_C_2X(result_10_##A##Bx##By, result_11_##A##Bx##By); \ | |||
| switch(n_count) { \ | |||
| case 3: _MASK_STORE_C_16(ptr_c + ldc * 2, result_10_##A##Bx##By); \ | |||
| case 2: _MASK_STORE_C_16(ptr_c + ldc * 1, result_01_##A##Bx##By); \ | |||
| case 1: _MASK_STORE_C_16(ptr_c + ldc * 0, result_00_##A##Bx##By); \ | |||
| } \ | |||
| ptr_c##A += ldc * n_count; \ | |||
| } | |||
| int CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, IFLOAT * B, FLOAT * C, BLASLONG ldc) | |||
| { | |||
| IFLOAT *ptr_a = A, *ptr_b = B; | |||
| IFLOAT *ptr_b0, *ptr_b1; | |||
| IFLOAT *ptr_a0, *ptr_a1; | |||
| FLOAT *ptr_c = C; | |||
| FLOAT *ptr_c0, *ptr_c1; | |||
| BLASLONG n_count = n; | |||
| BLASLONG m_count, k_count; | |||
| BLASLONG n_blksize = 4 * k; | |||
| BLASLONG cn_offset = 0; | |||
| __m512 alpha_512 = _mm512_broadcastss_ps(_mm_load_ss(&alpha)); | |||
| for (; n_count > 23; n_count -= 24) { | |||
| IFLOAT *ptr_b00 = ptr_b; | |||
| IFLOAT *ptr_b10 = ptr_b + n_blksize * 3; | |||
| ptr_a0 = ptr_a; | |||
| ptr_c = C + cn_offset * ldc; | |||
| m_count = m; | |||
| for (; m_count > 15; m_count -= 16) { | |||
| ptr_b0 = ptr_b00; | |||
| ptr_b1 = ptr_b10; | |||
| DECLARE_A_PAIR(0); | |||
| DECLARE_B_PAIR(); | |||
| DECLARE_RESULT_4X(0, 0, 0); DECLARE_RESULT_4X(0, 0, 1); DECLARE_RESULT_4X(0, 0, 2); | |||
| DECLARE_RESULT_4X(0, 1, 0); DECLARE_RESULT_4X(0, 1, 1); DECLARE_RESULT_4X(0, 1, 2); | |||
| k_count = k; | |||
| for (; k_count > 3; k_count -=4) { | |||
| LOAD_A_PAIR(0); | |||
| _mm_prefetch(ptr_a0 + 128, _MM_HINT_T0); | |||
| ptr_a0 += 16 * 2; | |||
| BROADCAST_B_PAIR(0, 0); PREFETCH_B(0, 0); MATMUL_4X(0, 0, 0); | |||
| BROADCAST_B_PAIR(0, 1); PREFETCH_B(0, 1); MATMUL_4X(0, 0, 1); | |||
| BROADCAST_B_PAIR(0, 2); PREFETCH_B(0, 2); MATMUL_4X(0, 0, 2); | |||
| ptr_b0 += 4 * 2; | |||
| BROADCAST_B_PAIR(1, 0); PREFETCH_B(1, 0); MATMUL_4X(0, 1, 0); | |||
| BROADCAST_B_PAIR(1, 1); PREFETCH_B(1, 1); MATMUL_4X(0, 1, 1); | |||
| BROADCAST_B_PAIR(1, 2); PREFETCH_B(1, 2); MATMUL_4X(0, 1, 2); | |||
| ptr_b1 += 4 * 2; | |||
| LOAD_A_PAIR(0); | |||
| _mm_prefetch(ptr_a0 + 128, _MM_HINT_T0); | |||
| ptr_a0 += 16 * 2; | |||
| BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0); | |||
| BROADCAST_B_PAIR(0, 1); MATMUL_4X(0, 0, 1); | |||
| BROADCAST_B_PAIR(0, 2); MATMUL_4X(0, 0, 2); | |||
| ptr_b0 += 4 * 2; | |||
| BROADCAST_B_PAIR(1, 0); MATMUL_4X(0, 1, 0); | |||
| BROADCAST_B_PAIR(1, 1); MATMUL_4X(0, 1, 1); | |||
| BROADCAST_B_PAIR(1, 2); MATMUL_4X(0, 1, 2); | |||
| ptr_b1 += 4 * 2; | |||
| } | |||
| for (; k_count > 1; k_count -=2) { | |||
| LOAD_A_PAIR(0); | |||
| ptr_a0 += 16 * 2; | |||
| BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0); | |||
| BROADCAST_B_PAIR(0, 1); MATMUL_4X(0, 0, 1); | |||
| BROADCAST_B_PAIR(0, 2); MATMUL_4X(0, 0, 2); | |||
| ptr_b0 += 4 * 2; | |||
| BROADCAST_B_PAIR(1, 0); MATMUL_4X(0, 1, 0); | |||
| BROADCAST_B_PAIR(1, 1); MATMUL_4X(0, 1, 1); | |||
| BROADCAST_B_PAIR(1, 2); MATMUL_4X(0, 1, 2); | |||
| ptr_b1 += 4 * 2; | |||
| } | |||
| if (k_count > 0) { | |||
| LOAD_A_PAIR_TAIL(0); | |||
| ptr_a0 += 16; | |||
| BROADCAST_B_PAIR_TAIL(0, 0); MATMUL_4X(0, 0, 0); | |||
| BROADCAST_B_PAIR_TAIL(0, 1); MATMUL_4X(0, 0, 1); | |||
| BROADCAST_B_PAIR_TAIL(0, 2); MATMUL_4X(0, 0, 2); | |||
| ptr_b0 += 4; | |||
| BROADCAST_B_PAIR_TAIL(1, 0); MATMUL_4X(0, 1, 0); | |||
| BROADCAST_B_PAIR_TAIL(1, 1); MATMUL_4X(0, 1, 1); | |||
| BROADCAST_B_PAIR_TAIL(1, 2); MATMUL_4X(0, 1, 2); | |||
| ptr_b1 += 4; | |||
| } | |||
| ptr_c0 = ptr_c; | |||
| STORE_4X(0, 0, 0); STORE_4X(0, 0, 1); STORE_4X(0, 0, 2); | |||
| STORE_4X(0, 1, 0); STORE_4X(0, 1, 1); STORE_4X(0, 1, 2); | |||
| ptr_c += 16; | |||
| } | |||
| if (m_count > 0) { | |||
| __mmask16 mmask = (1UL << m_count) - 1; | |||
| ptr_b0 = ptr_b00; | |||
| ptr_b1 = ptr_b10; | |||
| DECLARE_A_PAIR(0); | |||
| DECLARE_B_PAIR(); | |||
| DECLARE_RESULT_4X(0, 0, 0); DECLARE_RESULT_4X(0, 0, 1); DECLARE_RESULT_4X(0, 0, 2); | |||
| DECLARE_RESULT_4X(0, 1, 0); DECLARE_RESULT_4X(0, 1, 1); DECLARE_RESULT_4X(0, 1, 2); | |||
| for (k_count = k; k_count > 1; k_count -=2) { | |||
| MASK_LOAD_A_PAIR(0); | |||
| ptr_a0 += m_count * 2; | |||
| BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0); | |||
| BROADCAST_B_PAIR(0, 1); MATMUL_4X(0, 0, 1); | |||
| BROADCAST_B_PAIR(0, 2); MATMUL_4X(0, 0, 2); | |||
| ptr_b0 += 4 * 2; | |||
| BROADCAST_B_PAIR(1, 0); MATMUL_4X(0, 1, 0); | |||
| BROADCAST_B_PAIR(1, 1); MATMUL_4X(0, 1, 1); | |||
| BROADCAST_B_PAIR(1, 2); MATMUL_4X(0, 1, 2); | |||
| ptr_b1 += 4 * 2; | |||
| } | |||
| if (k_count > 0) { | |||
| MASK_LOAD_A_PAIR_TAIL(0); | |||
| ptr_a0 += m_count; | |||
| BROADCAST_B_PAIR_TAIL(0, 0); MATMUL_4X(0, 0, 0); | |||
| BROADCAST_B_PAIR_TAIL(0, 1); MATMUL_4X(0, 0, 1); | |||
| BROADCAST_B_PAIR_TAIL(0, 2); MATMUL_4X(0, 0, 2); | |||
| ptr_b0 += 4; | |||
| BROADCAST_B_PAIR_TAIL(1, 0); MATMUL_4X(0, 1, 0); | |||
| BROADCAST_B_PAIR_TAIL(1, 1); MATMUL_4X(0, 1, 1); | |||
| BROADCAST_B_PAIR_TAIL(1, 2); MATMUL_4X(0, 1, 2); | |||
| ptr_b1 += 4; | |||
| } | |||
| ptr_c0 = ptr_c; | |||
| MASK_STORE_4X(0, 0, 0); MASK_STORE_4X(0, 0, 1); MASK_STORE_4X(0, 0, 2); | |||
| MASK_STORE_4X(0, 1, 0); MASK_STORE_4X(0, 1, 1); MASK_STORE_4X(0, 1, 2); | |||
| ptr_c += m_count; | |||
| } | |||
| ptr_b += 24 * k; | |||
| cn_offset += 24; | |||
| } | |||
| for (; n_count > 11; n_count -= 12) { | |||
| IFLOAT *ptr_b00 = ptr_b; | |||
| ptr_a0 = ptr_a; | |||
| ptr_a1 = ptr_a + 16 * k; | |||
| ptr_c = C + cn_offset * ldc; | |||
| m_count = m; | |||
| for (; m_count > 31; m_count -= 32) { | |||
| ptr_b0 = ptr_b00; | |||
| DECLARE_A_PAIR(0); DECLARE_A_PAIR(1); | |||
| DECLARE_B_PAIR(); | |||
| DECLARE_RESULT_4X(0, 0, 0); DECLARE_RESULT_4X(0, 0, 1); DECLARE_RESULT_4X(0, 0, 2); | |||
| DECLARE_RESULT_4X(1, 0, 0); DECLARE_RESULT_4X(1, 0, 1); DECLARE_RESULT_4X(1, 0, 2); | |||
| for (k_count = k; k_count > 1; k_count -=2) { | |||
| LOAD_A_PAIR(0); LOAD_A_PAIR(1); | |||
| ptr_a0 += 16 * 2; | |||
| ptr_a1 += 16 * 2; | |||
| BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0); MATMUL_4X(1, 0, 0); | |||
| BROADCAST_B_PAIR(0, 1); MATMUL_4X(0, 0, 1); MATMUL_4X(1, 0, 1); | |||
| BROADCAST_B_PAIR(0, 2); MATMUL_4X(0, 0, 2); MATMUL_4X(1, 0, 2); | |||
| ptr_b0 += 4 * 2; | |||
| } | |||
| if (k_count > 0) { | |||
| LOAD_A_PAIR_TAIL(0); LOAD_A_PAIR_TAIL(1); | |||
| ptr_a0 += 16; | |||
| ptr_a1 += 16; | |||
| BROADCAST_B_PAIR_TAIL(0, 0); MATMUL_4X(0, 0, 0); MATMUL_4X(1, 0, 0); | |||
| BROADCAST_B_PAIR_TAIL(0, 1); MATMUL_4X(0, 0, 1); MATMUL_4X(1, 0, 1); | |||
| BROADCAST_B_PAIR_TAIL(0, 2); MATMUL_4X(0, 0, 2); MATMUL_4X(1, 0, 2); | |||
| ptr_b0 += 4; | |||
| } | |||
| ptr_c0 = ptr_c; | |||
| ptr_c1 = ptr_c + 16; | |||
| STORE_4X(0, 0, 0); STORE_4X(1, 0, 0); | |||
| STORE_4X(0, 0, 1); STORE_4X(1, 0, 1); | |||
| STORE_4X(0, 0, 2); STORE_4X(1, 0, 2); | |||
| ptr_c += 16 * 2; | |||
| ptr_a0 = ptr_a1; | |||
| ptr_a1 = ptr_a0 + 16 * k; | |||
| } | |||
| for (; m_count > 15; m_count -= 16) { | |||
| ptr_b0 = ptr_b00; | |||
| DECLARE_A_PAIR(0); | |||
| DECLARE_B_PAIR(); | |||
| DECLARE_RESULT_4X(0, 0, 0); DECLARE_RESULT_4X(0, 0, 1); DECLARE_RESULT_4X(0, 0, 2); | |||
| for (k_count = k; k_count > 1; k_count -=2) { | |||
| LOAD_A_PAIR(0); | |||
| ptr_a0 += 16 * 2; | |||
| BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0); | |||
| BROADCAST_B_PAIR(0, 1); MATMUL_4X(0, 0, 1); | |||
| BROADCAST_B_PAIR(0, 2); MATMUL_4X(0, 0, 2); | |||
| ptr_b0 += 4 * 2; | |||
| } | |||
| if (k_count > 0) { | |||
| LOAD_A_PAIR_TAIL(0); | |||
| ptr_a0 += 16; | |||
| BROADCAST_B_PAIR_TAIL(0, 0); MATMUL_4X(0, 0, 0); | |||
| BROADCAST_B_PAIR_TAIL(0, 1); MATMUL_4X(0, 0, 1); | |||
| BROADCAST_B_PAIR_TAIL(0, 2); MATMUL_4X(0, 0, 2); | |||
| ptr_b0 += 4; | |||
| } | |||
| ptr_c0 = ptr_c; | |||
| STORE_4X(0, 0, 0); STORE_4X(0, 0, 1); STORE_4X(0, 0, 2); | |||
| ptr_c += 16; | |||
| } | |||
| if (m_count > 0) { | |||
| __mmask16 mmask = (1UL << m_count) - 1; | |||
| ptr_b0 = ptr_b00; | |||
| DECLARE_A_PAIR(0); | |||
| DECLARE_B_PAIR(); | |||
| DECLARE_RESULT_4X(0, 0, 0); DECLARE_RESULT_4X(0, 0, 1); DECLARE_RESULT_4X(0, 0, 2); | |||
| for (k_count = k; k_count > 1; k_count -=2) { | |||
| MASK_LOAD_A_PAIR(0); | |||
| ptr_a0 += m_count * 2; | |||
| BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0); | |||
| BROADCAST_B_PAIR(0, 1); MATMUL_4X(0, 0, 1); | |||
| BROADCAST_B_PAIR(0, 2); MATMUL_4X(0, 0, 2); | |||
| ptr_b0 += 4 * 2; | |||
| } | |||
| if (k_count > 0) { | |||
| MASK_LOAD_A_PAIR_TAIL(0); | |||
| ptr_a0 += m_count; | |||
| BROADCAST_B_PAIR_TAIL(0, 0); MATMUL_4X(0, 0, 0); | |||
| BROADCAST_B_PAIR_TAIL(0, 1); MATMUL_4X(0, 0, 1); | |||
| BROADCAST_B_PAIR_TAIL(0, 2); MATMUL_4X(0, 0, 2); | |||
| ptr_b0 += 4; | |||
| } | |||
| ptr_c0 = ptr_c; | |||
| MASK_STORE_4X(0, 0, 0); MASK_STORE_4X(0, 0, 1); MASK_STORE_4X(0, 0, 2); | |||
| ptr_c += m_count; | |||
| } | |||
| ptr_b += 12 * k; | |||
| cn_offset += 12; | |||
| } | |||
| for (; n_count > 3; n_count -= 4) { | |||
| IFLOAT *ptr_b00 = ptr_b; | |||
| ptr_a0 = ptr_a; | |||
| ptr_c = C + cn_offset * ldc; | |||
| m_count = m; | |||
| for (; m_count > 15; m_count -= 16) { | |||
| ptr_b0 = ptr_b00; | |||
| DECLARE_A_PAIR(0); | |||
| DECLARE_B_PAIR(); | |||
| DECLARE_RESULT_4X(0, 0, 0); | |||
| for (k_count = k; k_count > 1; k_count -=2) { | |||
| LOAD_A_PAIR(0); | |||
| BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0); | |||
| ptr_b0 += 4 * 2; | |||
| ptr_a0 += 16 * 2; | |||
| } | |||
| if (k_count > 0) { | |||
| LOAD_A_PAIR_TAIL(0); | |||
| BROADCAST_B_PAIR_TAIL(0, 0); MATMUL_4X(0, 0, 0); | |||
| ptr_b0 += 4; | |||
| ptr_a0 += 16; | |||
| } | |||
| ptr_c0 = ptr_c; | |||
| STORE_4X(0, 0, 0); | |||
| ptr_c += 16; | |||
| } | |||
| if (m_count > 0) { | |||
| __mmask16 mmask = (1UL << m_count) - 1; | |||
| ptr_b0 = ptr_b00; | |||
| DECLARE_A_PAIR(0); | |||
| DECLARE_B_PAIR(); | |||
| DECLARE_RESULT_4X(0, 0, 0); | |||
| for (k_count = k; k_count > 1; k_count -=2) { | |||
| MASK_LOAD_A_PAIR(0); | |||
| BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0); | |||
| ptr_b0 += 4 * 2; | |||
| ptr_a0 += m_count * 2; | |||
| } | |||
| if (k_count > 0) { | |||
| MASK_LOAD_A_PAIR_TAIL(0); | |||
| BROADCAST_B_PAIR_TAIL(0, 0); MATMUL_4X(0, 0, 0); | |||
| ptr_b0 += 4; | |||
| ptr_a0 += m_count; | |||
| } | |||
| ptr_c0 = ptr_c; | |||
| MASK_STORE_4X(0, 0, 0); | |||
| ptr_c += m_count; | |||
| } | |||
| ptr_b += 4 * k; | |||
| cn_offset += 4; | |||
| } | |||
| if (n_count > 0) { | |||
| __mmask8 nmask = (1UL << n_count) - 1; | |||
| IFLOAT *ptr_b00 = ptr_b; | |||
| ptr_a0 = ptr_a; | |||
| ptr_c = C + cn_offset * ldc; | |||
| m_count = m; | |||
| for (; m_count > 15; m_count -= 16) { | |||
| ptr_b0 = ptr_b00; | |||
| DECLARE_A_PAIR(0); | |||
| DECLARE_B_PAIR(); | |||
| DECLARE_RESULT_4X(0, 0, 0); | |||
| for (k_count = k; k_count > 1; k_count -=2) { | |||
| LOAD_A_PAIR(0); | |||
| MASK_BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0); | |||
| ptr_b0 += n_count * 2; | |||
| ptr_a0 += 16 * 2; | |||
| } | |||
| if (k_count > 0) { | |||
| LOAD_A_PAIR_TAIL(0); | |||
| MASK_BROADCAST_B_PAIR_TAIL(0, 0); MATMUL_4X(0, 0, 0); | |||
| ptr_b0 += n_count; | |||
| ptr_a0 += 16; | |||
| } | |||
| ptr_c0 = ptr_c; | |||
| N_STORE_4X(0, 0, 0); | |||
| ptr_c += 16; | |||
| } | |||
| if (m_count > 0) { | |||
| __mmask16 mmask = (1UL << m_count) - 1; | |||
| ptr_b0 = ptr_b00; | |||
| DECLARE_A_PAIR(0); | |||
| DECLARE_B_PAIR(); | |||
| DECLARE_RESULT_4X(0, 0, 0); | |||
| for (k_count = k; k_count > 1; k_count -=2) { | |||
| MASK_LOAD_A_PAIR(0); | |||
| MASK_BROADCAST_B_PAIR(0, 0); MATMUL_4X(0, 0, 0); | |||
| ptr_b0 += n_count * 2; | |||
| ptr_a0 += m_count * 2; | |||
| } | |||
| if (k_count > 0) { | |||
| MASK_LOAD_A_PAIR_TAIL(0); | |||
| MASK_BROADCAST_B_PAIR_TAIL(0, 0); MATMUL_4X(0, 0, 0); | |||
| ptr_b0 += n_count; | |||
| ptr_a0 += m_count; | |||
| } | |||
| ptr_c0 = ptr_c; | |||
| N_MASK_STORE_4X(0, 0, 0); | |||
| ptr_c += m_count; | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,353 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2021, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include <stdio.h> | |||
| #include <immintrin.h> | |||
| #include "common.h" | |||
| #define _MM512_SHUFFLE_i32(result, in1, in2, imm8) \ | |||
| asm("vshufps %3, %2, %1, %0": "=v"(result): "v"(in1), "v"(in2), "N"(imm8)) | |||
| #define REORDER_8x32(t0, t1, t2, t3, t4, t5, t6, t7) { \ | |||
| __m512i v; \ | |||
| t0 = _mm512_unpacklo_epi32(r0, r1); \ | |||
| t1 = _mm512_unpackhi_epi32(r0, r1); \ | |||
| t2 = _mm512_unpacklo_epi32(r2, r3); \ | |||
| t3 = _mm512_unpackhi_epi32(r2, r3); \ | |||
| t4 = _mm512_unpacklo_epi32(r4, r5); \ | |||
| t5 = _mm512_unpackhi_epi32(r4, r5); \ | |||
| t6 = _mm512_unpacklo_epi32(r6, r7); \ | |||
| t7 = _mm512_unpackhi_epi32(r6, r7); \ | |||
| _MM512_SHUFFLE_i32(v, t0, t2, 0x4E); \ | |||
| r0 = _mm512_mask_blend_epi32(kc, t0, v); \ | |||
| r1 = _mm512_mask_blend_epi32(k3, t2, v); \ | |||
| _MM512_SHUFFLE_i32(v, t1, t3, 0x4E); \ | |||
| r2 = _mm512_mask_blend_epi32(kc, t1, v); \ | |||
| r3 = _mm512_mask_blend_epi32(k3, t3, v); \ | |||
| _MM512_SHUFFLE_i32(v, t4, t6, 0x4E); \ | |||
| r4 = _mm512_mask_blend_epi32(kc, t4, v); \ | |||
| r5 = _mm512_mask_blend_epi32(k3, t6, v); \ | |||
| _MM512_SHUFFLE_i32(v, t5, t7, 0x4E); \ | |||
| r6 = _mm512_mask_blend_epi32(kc, t5, v); \ | |||
| r7 = _mm512_mask_blend_epi32(k3, t7, v); \ | |||
| t0 = _mm512_permutex2var_epi32(r0, idx_lo, r4); \ | |||
| t1 = _mm512_permutex2var_epi32(r1, idx_lo, r5); \ | |||
| t2 = _mm512_permutex2var_epi32(r2, idx_lo, r6); \ | |||
| t3 = _mm512_permutex2var_epi32(r3, idx_lo, r7); \ | |||
| t4 = _mm512_permutex2var_epi32(r0, idx_hi, r4); \ | |||
| t5 = _mm512_permutex2var_epi32(r1, idx_hi, r5); \ | |||
| t6 = _mm512_permutex2var_epi32(r2, idx_hi, r6); \ | |||
| t7 = _mm512_permutex2var_epi32(r3, idx_hi, r7); \ | |||
| } | |||
| #define STORE_512_LO(x) \ | |||
| v = _mm512_permutex2var_epi64(t0##x, idx_lo2, t1##x); \ | |||
| _mm512_storeu_si512(boffset0 + x*32, v); | |||
| #define STORE_512_HI(x) \ | |||
| v = _mm512_permutex2var_epi64(t0##x, idx_hi2, t1##x); \ | |||
| _mm512_storeu_si512(boffset0 + (x + 8)*32, v); | |||
| #define MASK_STORE_512_LO(x) \ | |||
| v = _mm512_permutex2var_epi64(t0##x, idx_lo2, t1##x); \ | |||
| _mm512_mask_storeu_epi32(boffset0 + 2*x*remain_n, nmask, v); | |||
| #define MASK_STORE_512_HI(x) \ | |||
| v = _mm512_permutex2var_epi64(t0##x, idx_hi2, t1##x); \ | |||
| _mm512_mask_storeu_epi32(boffset0 + 2*(x + 8)*remain_n, nmask, v); | |||
| #define STORE_512(x, y) {\ | |||
| __m512i v; \ | |||
| if (x == 0) { STORE_512_LO(y); } \ | |||
| else { STORE_512_HI(y); } \ | |||
| } | |||
| #define MASK_STORE_512(x, y) {\ | |||
| __m512i v; \ | |||
| if (x == 0) { MASK_STORE_512_LO(y); } \ | |||
| else { MASK_STORE_512_HI(y); } \ | |||
| } | |||
| #define SET_TAIL(y, x) {\ | |||
| if (y == 0) tail = _mm512_permutex2var_epi64(t0##x, idx_lo2, t1##x); \ | |||
| else tail = _mm512_permutex2var_epi64(t0##x, idx_hi2, t1##x); \ | |||
| } | |||
| #define GET_TAIL() \ | |||
| switch (n_store + 1) { \ | |||
| case 16: SET_TAIL(1, 7); break; \ | |||
| case 15: SET_TAIL(1, 6); break; \ | |||
| case 14: SET_TAIL(1, 5); break; \ | |||
| case 13: SET_TAIL(1, 4); break; \ | |||
| case 12: SET_TAIL(1, 3); break; \ | |||
| case 11: SET_TAIL(1, 2); break; \ | |||
| case 10: SET_TAIL(1, 1); break; \ | |||
| case 9: SET_TAIL(1, 0); break; \ | |||
| case 8: SET_TAIL(0, 7); break; \ | |||
| case 7: SET_TAIL(0, 6); break; \ | |||
| case 6: SET_TAIL(0, 5); break; \ | |||
| case 5: SET_TAIL(0, 4); break; \ | |||
| case 4: SET_TAIL(0, 3); break; \ | |||
| case 3: SET_TAIL(0, 2); break; \ | |||
| case 2: SET_TAIL(0, 1); break; \ | |||
| case 1: SET_TAIL(0, 0); break; \ | |||
| } | |||
| int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ | |||
| BLASLONG i, j; | |||
| IFLOAT *boffset0; | |||
| IFLOAT *aoffset; | |||
| IFLOAT *aoffset00, *aoffset01, *aoffset02, *aoffset03, *aoffset04, *aoffset05, *aoffset06, *aoffset07; | |||
| IFLOAT *aoffset10, *aoffset11, *aoffset12, *aoffset13, *aoffset14, *aoffset15, *aoffset16, *aoffset17; | |||
| aoffset = a; | |||
| boffset0 = b; | |||
| BLASLONG n16 = n & ~15; | |||
| BLASLONG m32 = m & ~31; | |||
| int permute_table[] = { | |||
| 0x0, 0x1, 0x2, 0x3, 0x10, 0x11, 0x12, 0x13, 0x8, 0x9, 0xa, 0xb, 0x18, 0x19, 0x1a, 0x1b, | |||
| 0x4, 0x5, 0x6, 0x7, 0x14, 0x15, 0x16, 0x17, 0xc, 0xd, 0xe, 0xf, 0x1c, 0x1d, 0x1e, 0x1f, | |||
| }; | |||
| u_int64_t permute_table2[] = { | |||
| 0x00, 0x01, 0x02, 0x03, 8|0x0, 8|0x1, 8|0x2, 8|0x3, | |||
| 0x04, 0x05, 0x06, 0x07, 8|0x4, 8|0x5, 8|0x6, 8|0x7, | |||
| }; | |||
| __m512i idx_lo = _mm512_loadu_si512(permute_table); | |||
| __m512i idx_hi = _mm512_loadu_si512(permute_table + 16); | |||
| __m512i idx_lo2 = _mm512_loadu_si512(permute_table2); | |||
| __m512i idx_hi2 = _mm512_loadu_si512(permute_table2 + 8); | |||
| __mmask16 kc = 0xcccc; | |||
| __mmask16 k3 = 0x3333; | |||
| __m512i r0, r1, r2, r3, r4, r5, r6, r7; | |||
| __m512i t00, t01, t02, t03, t04, t05, t06, t07; | |||
| __m512i t10, t11, t12, t13, t14, t15, t16, t17; | |||
| for (j = 0; j < n16; j += 16) { | |||
| aoffset00 = aoffset; | |||
| aoffset01 = aoffset00 + lda; | |||
| aoffset02 = aoffset01 + lda; | |||
| aoffset03 = aoffset02 + lda; | |||
| aoffset04 = aoffset03 + lda; | |||
| aoffset05 = aoffset04 + lda; | |||
| aoffset06 = aoffset05 + lda; | |||
| aoffset07 = aoffset06 + lda; | |||
| aoffset10 = aoffset07 + lda; | |||
| aoffset11 = aoffset10 + lda; | |||
| aoffset12 = aoffset11 + lda; | |||
| aoffset13 = aoffset12 + lda; | |||
| aoffset14 = aoffset13 + lda; | |||
| aoffset15 = aoffset14 + lda; | |||
| aoffset16 = aoffset15 + lda; | |||
| aoffset17 = aoffset16 + lda; | |||
| aoffset += 16 * lda; | |||
| for (i = 0; i < m32; i += 32) { | |||
| r0 = _mm512_loadu_si512(aoffset00 + i); | |||
| r1 = _mm512_loadu_si512(aoffset01 + i); | |||
| r2 = _mm512_loadu_si512(aoffset02 + i); | |||
| r3 = _mm512_loadu_si512(aoffset03 + i); | |||
| r4 = _mm512_loadu_si512(aoffset04 + i); | |||
| r5 = _mm512_loadu_si512(aoffset05 + i); | |||
| r6 = _mm512_loadu_si512(aoffset06 + i); | |||
| r7 = _mm512_loadu_si512(aoffset07 + i); | |||
| REORDER_8x32(t00, t01, t02, t03, t04, t05, t06, t07); | |||
| r0 = _mm512_loadu_si512(aoffset10 + i); | |||
| r1 = _mm512_loadu_si512(aoffset11 + i); | |||
| r2 = _mm512_loadu_si512(aoffset12 + i); | |||
| r3 = _mm512_loadu_si512(aoffset13 + i); | |||
| r4 = _mm512_loadu_si512(aoffset14 + i); | |||
| r5 = _mm512_loadu_si512(aoffset15 + i); | |||
| r6 = _mm512_loadu_si512(aoffset16 + i); | |||
| r7 = _mm512_loadu_si512(aoffset17 + i); | |||
| REORDER_8x32(t10, t11, t12, t13, t14, t15, t16, t17); | |||
| STORE_512(0, 0); STORE_512(0, 1); STORE_512(0, 2); STORE_512(0, 3); | |||
| STORE_512(0, 4); STORE_512(0, 5); STORE_512(0, 6); STORE_512(0, 7); | |||
| STORE_512(1, 0); STORE_512(1, 1); STORE_512(1, 2); STORE_512(1, 3); | |||
| STORE_512(1, 4); STORE_512(1, 5); STORE_512(1, 6); STORE_512(1, 7); | |||
| boffset0 += 16 * 32; | |||
| } | |||
| if (i < m) { | |||
| int remain_m = m - i; | |||
| __mmask32 mmask = (1UL << remain_m) - 1; | |||
| r0 = _mm512_maskz_loadu_epi16(mmask, aoffset00 + i); | |||
| r1 = _mm512_maskz_loadu_epi16(mmask, aoffset01 + i); | |||
| r2 = _mm512_maskz_loadu_epi16(mmask, aoffset02 + i); | |||
| r3 = _mm512_maskz_loadu_epi16(mmask, aoffset03 + i); | |||
| r4 = _mm512_maskz_loadu_epi16(mmask, aoffset04 + i); | |||
| r5 = _mm512_maskz_loadu_epi16(mmask, aoffset05 + i); | |||
| r6 = _mm512_maskz_loadu_epi16(mmask, aoffset06 + i); | |||
| r7 = _mm512_maskz_loadu_epi16(mmask, aoffset07 + i); | |||
| REORDER_8x32(t00, t01, t02, t03, t04, t05, t06, t07); | |||
| r0 = _mm512_maskz_loadu_epi16(mmask, aoffset10 + i); | |||
| r1 = _mm512_maskz_loadu_epi16(mmask, aoffset11 + i); | |||
| r2 = _mm512_maskz_loadu_epi16(mmask, aoffset12 + i); | |||
| r3 = _mm512_maskz_loadu_epi16(mmask, aoffset13 + i); | |||
| r4 = _mm512_maskz_loadu_epi16(mmask, aoffset14 + i); | |||
| r5 = _mm512_maskz_loadu_epi16(mmask, aoffset15 + i); | |||
| r6 = _mm512_maskz_loadu_epi16(mmask, aoffset16 + i); | |||
| r7 = _mm512_maskz_loadu_epi16(mmask, aoffset17 + i); | |||
| REORDER_8x32(t10, t11, t12, t13, t14, t15, t16, t17); | |||
| int n_store = remain_m/2; | |||
| switch (n_store) { | |||
| case 15: STORE_512(1, 6); | |||
| case 14: STORE_512(1, 5); | |||
| case 13: STORE_512(1, 4); | |||
| case 12: STORE_512(1, 3); | |||
| case 11: STORE_512(1, 2); | |||
| case 10: STORE_512(1, 1); | |||
| case 9: STORE_512(1, 0); | |||
| case 8: STORE_512(0, 7); | |||
| case 7: STORE_512(0, 6); | |||
| case 6: STORE_512(0, 5); | |||
| case 5: STORE_512(0, 4); | |||
| case 4: STORE_512(0, 3); | |||
| case 3: STORE_512(0, 2); | |||
| case 2: STORE_512(0, 1); | |||
| case 1: STORE_512(0, 0); | |||
| } | |||
| boffset0 += n_store * 32; | |||
| if (m & 0x1) { | |||
| __m512i tail; | |||
| GET_TAIL(); | |||
| _mm256_storeu_si256((void *)boffset0, _mm512_cvtepi32_epi16(tail)); | |||
| boffset0 += 16; | |||
| } | |||
| } | |||
| } | |||
| if (j < n) { | |||
| int remain_n = n - j; | |||
| __mmask16 nmask = (1UL << remain_n) - 1; | |||
| int load0, load1; | |||
| if (remain_n > 8) { | |||
| load0 = 8; | |||
| load1 = remain_n - 8; | |||
| } else { | |||
| load0 = remain_n; | |||
| load1 = 0; | |||
| } | |||
| aoffset00 = aoffset; | |||
| aoffset01 = aoffset00 + lda; | |||
| aoffset02 = aoffset01 + lda; | |||
| aoffset03 = aoffset02 + lda; | |||
| aoffset04 = aoffset03 + lda; | |||
| aoffset05 = aoffset04 + lda; | |||
| aoffset06 = aoffset05 + lda; | |||
| aoffset07 = aoffset06 + lda; | |||
| aoffset10 = aoffset07 + lda; | |||
| aoffset11 = aoffset10 + lda; | |||
| aoffset12 = aoffset11 + lda; | |||
| aoffset13 = aoffset12 + lda; | |||
| aoffset14 = aoffset13 + lda; | |||
| aoffset15 = aoffset14 + lda; | |||
| aoffset16 = aoffset15 + lda; | |||
| aoffset17 = aoffset16 + lda; | |||
| aoffset += 16 * lda; | |||
| for (i = 0; i < m32; i += 32) { | |||
| switch (load0) { | |||
| case 8: r7 = _mm512_loadu_si512(aoffset07 + i); | |||
| case 7: r6 = _mm512_loadu_si512(aoffset06 + i); | |||
| case 6: r5 = _mm512_loadu_si512(aoffset05 + i); | |||
| case 5: r4 = _mm512_loadu_si512(aoffset04 + i); | |||
| case 4: r3 = _mm512_loadu_si512(aoffset03 + i); | |||
| case 3: r2 = _mm512_loadu_si512(aoffset02 + i); | |||
| case 2: r1 = _mm512_loadu_si512(aoffset01 + i); | |||
| case 1: r0 = _mm512_loadu_si512(aoffset00 + i); | |||
| } | |||
| REORDER_8x32(t00, t01, t02, t03, t04, t05, t06, t07); | |||
| switch (load1) { | |||
| case 8: r7 = _mm512_loadu_si512(aoffset17 + i); | |||
| case 7: r6 = _mm512_loadu_si512(aoffset16 + i); | |||
| case 6: r5 = _mm512_loadu_si512(aoffset15 + i); | |||
| case 5: r4 = _mm512_loadu_si512(aoffset14 + i); | |||
| case 4: r3 = _mm512_loadu_si512(aoffset13 + i); | |||
| case 3: r2 = _mm512_loadu_si512(aoffset12 + i); | |||
| case 2: r1 = _mm512_loadu_si512(aoffset11 + i); | |||
| case 1: r0 = _mm512_loadu_si512(aoffset10 + i); | |||
| } | |||
| REORDER_8x32(t10, t11, t12, t13, t14, t15, t16, t17); | |||
| MASK_STORE_512(0, 0); MASK_STORE_512(0, 1); MASK_STORE_512(0, 2); MASK_STORE_512(0, 3); | |||
| MASK_STORE_512(0, 4); MASK_STORE_512(0, 5); MASK_STORE_512(0, 6); MASK_STORE_512(0, 7); | |||
| MASK_STORE_512(1, 0); MASK_STORE_512(1, 1); MASK_STORE_512(1, 2); MASK_STORE_512(1, 3); | |||
| MASK_STORE_512(1, 4); MASK_STORE_512(1, 5); MASK_STORE_512(1, 6); MASK_STORE_512(1, 7); | |||
| boffset0 += remain_n * 32; | |||
| } | |||
| if (i < m) { | |||
| int remain_m = m - i; | |||
| __mmask32 mmask = (1UL << remain_m) - 1; | |||
| switch (load0) { | |||
| case 8: r7 = _mm512_maskz_loadu_epi16(mmask, aoffset07 + i); | |||
| case 7: r6 = _mm512_maskz_loadu_epi16(mmask, aoffset06 + i); | |||
| case 6: r5 = _mm512_maskz_loadu_epi16(mmask, aoffset05 + i); | |||
| case 5: r4 = _mm512_maskz_loadu_epi16(mmask, aoffset04 + i); | |||
| case 4: r3 = _mm512_maskz_loadu_epi16(mmask, aoffset03 + i); | |||
| case 3: r2 = _mm512_maskz_loadu_epi16(mmask, aoffset02 + i); | |||
| case 2: r1 = _mm512_maskz_loadu_epi16(mmask, aoffset01 + i); | |||
| case 1: r0 = _mm512_maskz_loadu_epi16(mmask, aoffset00 + i); | |||
| } | |||
| REORDER_8x32(t00, t01, t02, t03, t04, t05, t06, t07); | |||
| switch (load1) { | |||
| case 8: r7 = _mm512_maskz_loadu_epi16(mmask, aoffset17 + i); | |||
| case 7: r6 = _mm512_maskz_loadu_epi16(mmask, aoffset16 + i); | |||
| case 6: r5 = _mm512_maskz_loadu_epi16(mmask, aoffset15 + i); | |||
| case 5: r4 = _mm512_maskz_loadu_epi16(mmask, aoffset14 + i); | |||
| case 4: r3 = _mm512_maskz_loadu_epi16(mmask, aoffset13 + i); | |||
| case 3: r2 = _mm512_maskz_loadu_epi16(mmask, aoffset12 + i); | |||
| case 2: r1 = _mm512_maskz_loadu_epi16(mmask, aoffset11 + i); | |||
| case 1: r0 = _mm512_maskz_loadu_epi16(mmask, aoffset10 + i); | |||
| } | |||
| REORDER_8x32(t10, t11, t12, t13, t14, t15, t16, t17); | |||
| int n_store = remain_m/2; | |||
| switch (n_store) { | |||
| case 15: MASK_STORE_512(1, 6); | |||
| case 14: MASK_STORE_512(1, 5); | |||
| case 13: MASK_STORE_512(1, 4); | |||
| case 12: MASK_STORE_512(1, 3); | |||
| case 11: MASK_STORE_512(1, 2); | |||
| case 10: MASK_STORE_512(1, 1); | |||
| case 9: MASK_STORE_512(1, 0); | |||
| case 8: MASK_STORE_512(0, 7); | |||
| case 7: MASK_STORE_512(0, 6); | |||
| case 6: MASK_STORE_512(0, 5); | |||
| case 5: MASK_STORE_512(0, 4); | |||
| case 4: MASK_STORE_512(0, 3); | |||
| case 3: MASK_STORE_512(0, 2); | |||
| case 2: MASK_STORE_512(0, 1); | |||
| case 1: MASK_STORE_512(0, 0); | |||
| } | |||
| boffset0 += n_store * remain_n * 2; | |||
| if (m & 0x1) { | |||
| __m512i tail; | |||
| GET_TAIL(); | |||
| _mm256_mask_storeu_epi16((void *)boffset0, nmask, _mm512_cvtepi32_epi16(tail)); | |||
| } | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,208 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2021, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include <stdio.h> | |||
| #include <immintrin.h> | |||
| #include "common.h" | |||
| #define REORDER_4x32(r0, r1, r2, r3) {\ | |||
| __m512i t0, t1, t2, t3; \ | |||
| t0 = _mm512_unpacklo_epi32(r0, r1); \ | |||
| t1 = _mm512_unpackhi_epi32(r0, r1); \ | |||
| t2 = _mm512_unpacklo_epi32(r2, r3); \ | |||
| t3 = _mm512_unpackhi_epi32(r2, r3); \ | |||
| r0 = _mm512_unpacklo_epi64(t0, t2); \ | |||
| r1 = _mm512_unpackhi_epi64(t0, t2); \ | |||
| r2 = _mm512_unpacklo_epi64(t1, t3); \ | |||
| r3 = _mm512_unpackhi_epi64(t1, t3); \ | |||
| t0 = _mm512_permutex2var_epi32(r0, idx_lo_128, r1); \ | |||
| t1 = _mm512_permutex2var_epi32(r0, idx_hi_128, r1); \ | |||
| t2 = _mm512_permutex2var_epi32(r2, idx_lo_128, r3); \ | |||
| t3 = _mm512_permutex2var_epi32(r2, idx_hi_128, r3); \ | |||
| r0 = _mm512_permutex2var_epi32(t0, idx_lo_256, t2); \ | |||
| r1 = _mm512_permutex2var_epi32(t1, idx_lo_256, t3); \ | |||
| r2 = _mm512_permutex2var_epi32(t0, idx_hi_256, t2); \ | |||
| r3 = _mm512_permutex2var_epi32(t1, idx_hi_256, t3); \ | |||
| } | |||
| #define REORDER_4x8(r0, r1, r2, r3) {\ | |||
| __m128i t0, t1, t2, t3; \ | |||
| t0 = _mm_unpacklo_epi32(r0, r1); \ | |||
| t1 = _mm_unpackhi_epi32(r0, r1); \ | |||
| t2 = _mm_unpacklo_epi32(r2, r3); \ | |||
| t3 = _mm_unpackhi_epi32(r2, r3); \ | |||
| r0 = _mm_unpacklo_epi64(t0, t2); \ | |||
| r1 = _mm_unpackhi_epi64(t0, t2); \ | |||
| r2 = _mm_unpacklo_epi64(t1, t3); \ | |||
| r3 = _mm_unpackhi_epi64(t1, t3); \ | |||
| } | |||
| #define GET_TAIL(tail, remain_m) \ | |||
| switch((remain_m + 1)/2) { \ | |||
| case 1: tail = r0; break; \ | |||
| case 2: tail = r1; break; \ | |||
| case 3: tail = r2; break; \ | |||
| case 4: tail = r3; break; \ | |||
| } | |||
| int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ | |||
| BLASLONG i, j; | |||
| IFLOAT *aoffset; | |||
| IFLOAT *aoffset0, *aoffset1, *aoffset2, *aoffset3; | |||
| IFLOAT *boffset; | |||
| aoffset = a; | |||
| boffset = b; | |||
| BLASLONG m32 = m & ~31; | |||
| BLASLONG m8 = m & ~7; | |||
| BLASLONG n4 = n & ~3; | |||
| int permute_table[] = { | |||
| 0x0, 0x1, 0x2, 0x3, 0x10, 0x11, 0x12, 0x13, 0x8, 0x9, 0xa, 0xb, 0x18, 0x19, 0x1a, 0x1b, | |||
| 0x4, 0x5, 0x6, 0x7, 0x14, 0x15, 0x16, 0x17, 0xc, 0xd, 0xe, 0xf, 0x1c, 0x1d, 0x1e, 0x1f, | |||
| 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, | |||
| 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, | |||
| }; | |||
| __m512i idx_lo_128 = _mm512_loadu_si512(permute_table); | |||
| __m512i idx_hi_128 = _mm512_loadu_si512(permute_table + 16); | |||
| __m512i idx_lo_256 = _mm512_loadu_si512(permute_table + 32); | |||
| __m512i idx_hi_256 = _mm512_loadu_si512(permute_table + 48); | |||
| for (j = 0; j < n4; j += 4) { | |||
| aoffset0 = aoffset; | |||
| aoffset1 = aoffset0 + lda; | |||
| aoffset2 = aoffset1 + lda; | |||
| aoffset3 = aoffset2 + lda; | |||
| aoffset += 4 * lda; | |||
| for (i = 0; i < m32; i += 32) { | |||
| __m512i r0, r1, r2, r3; | |||
| r0 = _mm512_loadu_si512(aoffset0 + i); | |||
| r1 = _mm512_loadu_si512(aoffset1 + i); | |||
| r2 = _mm512_loadu_si512(aoffset2 + i); | |||
| r3 = _mm512_loadu_si512(aoffset3 + i); | |||
| REORDER_4x32(r0, r1, r2, r3); | |||
| _mm512_storeu_si512(boffset + 32*0, r0); | |||
| _mm512_storeu_si512(boffset + 32*1, r1); | |||
| _mm512_storeu_si512(boffset + 32*2, r2); | |||
| _mm512_storeu_si512(boffset + 32*3, r3); | |||
| boffset += 32 * 4; | |||
| } | |||
| for (; i < m8; i += 8) { | |||
| __m128i r0 = _mm_loadu_si128((void *)(aoffset0 + i)); | |||
| __m128i r1 = _mm_loadu_si128((void *)(aoffset1 + i)); | |||
| __m128i r2 = _mm_loadu_si128((void *)(aoffset2 + i)); | |||
| __m128i r3 = _mm_loadu_si128((void *)(aoffset3 + i)); | |||
| REORDER_4x8(r0, r1, r2, r3); | |||
| _mm_storeu_si128((void *)(boffset + 8*0), r0); | |||
| _mm_storeu_si128((void *)(boffset + 8*1), r1); | |||
| _mm_storeu_si128((void *)(boffset + 8*2), r2); | |||
| _mm_storeu_si128((void *)(boffset + 8*3), r3); | |||
| boffset += 8 * 4; | |||
| } | |||
| if (i < m) { | |||
| int remain_m = m - i; | |||
| __mmask8 r_mask = (1UL << remain_m) - 1; | |||
| __m128i r0 = _mm_maskz_loadu_epi16(r_mask, aoffset0 + i); | |||
| __m128i r1 = _mm_maskz_loadu_epi16(r_mask, aoffset1 + i); | |||
| __m128i r2 = _mm_maskz_loadu_epi16(r_mask, aoffset2 + i); | |||
| __m128i r3 = _mm_maskz_loadu_epi16(r_mask, aoffset3 + i); | |||
| REORDER_4x8(r0, r1, r2, r3); | |||
| // store should skip the tail odd line | |||
| int num_store = remain_m/2; | |||
| switch(num_store) { | |||
| case 3: _mm_storeu_si128((void *)(boffset + 8*2), r2); | |||
| case 2: _mm_storeu_si128((void *)(boffset + 8*1), r1); | |||
| case 1: _mm_storeu_si128((void *)(boffset + 8*0), r0); | |||
| } | |||
| boffset += 8 * num_store; | |||
| if (m & 0x1) { // handling the tail | |||
| __m128i tail; | |||
| GET_TAIL(tail, remain_m); | |||
| /* tail vector is fill with zero like: | |||
| * a, 0, b, 0, c, 0, d, 0 | |||
| * need to extract lo words of data and store | |||
| */ | |||
| tail = _mm_cvtepi32_epi16(tail); | |||
| _mm_store_sd((double *)boffset, (__m128d) tail); // only lower 4 bfloat valid | |||
| boffset += 4; | |||
| } | |||
| } | |||
| } | |||
| if (j < n) { | |||
| int remain_n = n - j; | |||
| __mmask8 nmask = (1UL << remain_n) - 1; | |||
| aoffset0 = aoffset; | |||
| aoffset1 = aoffset0 + lda; | |||
| aoffset2 = aoffset1 + lda; | |||
| aoffset3 = aoffset2 + lda; | |||
| __m128i r0, r1, r2, r3; | |||
| for (i = 0; i < m8; i += 8) { | |||
| switch (remain_n) { | |||
| case 3: r2 = _mm_loadu_si128((void *)(aoffset2 + i)); | |||
| case 2: r1 = _mm_loadu_si128((void *)(aoffset1 + i)); | |||
| case 1: r0 = _mm_loadu_si128((void *)(aoffset0 + i)); | |||
| } | |||
| REORDER_4x8(r0, r1, r2, r3); | |||
| _mm_mask_storeu_epi32(boffset + remain_n * 0, nmask, r0); | |||
| _mm_mask_storeu_epi32(boffset + remain_n * 2, nmask, r1); | |||
| _mm_mask_storeu_epi32(boffset + remain_n * 4, nmask, r2); | |||
| _mm_mask_storeu_epi32(boffset + remain_n * 6, nmask, r3); | |||
| boffset += 8 * remain_n; | |||
| } | |||
| if (i < m) { | |||
| int remain_m = m - i; | |||
| __mmask8 mmask = (1UL << remain_m) - 1; | |||
| switch (remain_n) { | |||
| case 3: r2 = _mm_maskz_loadu_epi16(mmask, aoffset2 + i); | |||
| case 2: r1 = _mm_maskz_loadu_epi16(mmask, aoffset1 + i); | |||
| case 1: r0 = _mm_maskz_loadu_epi16(mmask, aoffset0 + i); | |||
| } | |||
| REORDER_4x8(r0, r1, r2, r3); | |||
| int num_store = remain_m/2; | |||
| switch (num_store) { | |||
| case 3: _mm_mask_storeu_epi32(boffset + remain_n * 4, nmask, r2); | |||
| case 2: _mm_mask_storeu_epi32(boffset + remain_n * 2, nmask, r1); | |||
| case 1: _mm_mask_storeu_epi32(boffset + remain_n * 0, nmask, r0); | |||
| } | |||
| boffset += 2 * num_store * remain_n; | |||
| if (m & 0x1) { | |||
| __m128i tail; | |||
| GET_TAIL(tail, remain_m); | |||
| tail = _mm_cvtepi32_epi16(tail); | |||
| _mm_mask_storeu_epi16(boffset, nmask, tail); | |||
| } | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -38,5 +38,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| int CNAME(int transa, int transb, BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, FLOAT beta) | |||
| { | |||
| return 1; | |||
| double MNK = (double) M * (double) N * (double) K; | |||
| if (MNK > 256.0*256.0*256.0) // disable for big size matrix | |||
| return 0; | |||
| /* small matrix kernel works well for N = 8, 16, 32 */ | |||
| if (N == 8 || N == 16 || N == 32) | |||
| return 1; | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,164 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2021, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include <stdio.h> | |||
| #include <immintrin.h> | |||
| #include "common.h" | |||
| int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ | |||
| BLASLONG i, j; | |||
| IFLOAT *boffset0, *boffset1; | |||
| boffset0 = b; | |||
| BLASLONG n32 = n & ~31; | |||
| BLASLONG m4 = m & ~3; | |||
| BLASLONG m2 = m & ~1; | |||
| uint32_t permute_table[] = { | |||
| 0x00, 0x01, 0x02, 0x03, 0x10, 0x11, 0x12, 0x13, 0x04, 0x05, 0x06, 0x07, 0x14, 0x15, 0x16, 0x17, | |||
| 0x08, 0x09, 0x0a, 0x0b, 0x18, 0x19, 0x1a, 0x1b, 0x0c, 0x0d, 0x0e, 0x0f, 0x1c, 0x1d, 0x1e, 0x1f, | |||
| }; | |||
| __m512i idx_lo = _mm512_loadu_si512(permute_table); | |||
| __m512i idx_hi = _mm512_loadu_si512(permute_table + 16); | |||
| for (j = 0; j < n32; j += 32) { | |||
| /* process 2x16 n at the same time */ | |||
| boffset1 = boffset0 + m * 16; | |||
| for (i = 0; i < m4; i += 4) { | |||
| /* bf16 fma need special memory layout: | |||
| * for memory layout like below: | |||
| * a00, a01, a02, a03, a04, a05 .... | |||
| * a10, a11, a12, a13, a14, a15 .... | |||
| * need to copy as: | |||
| * a00, a10, a01, a11, a02, a12, a03, a13, ... | |||
| */ | |||
| __m512i a0 = _mm512_loadu_si512(&a[(i + 0)*lda + j]); | |||
| __m512i a1 = _mm512_loadu_si512(&a[(i + 1)*lda + j]); | |||
| __m512i a2 = _mm512_loadu_si512(&a[(i + 2)*lda + j]); | |||
| __m512i a3 = _mm512_loadu_si512(&a[(i + 3)*lda + j]); | |||
| __m512i a00 = _mm512_unpacklo_epi16(a0, a1); | |||
| __m512i a01 = _mm512_unpackhi_epi16(a0, a1); | |||
| __m512i a10 = _mm512_unpacklo_epi16(a2, a3); | |||
| __m512i a11 = _mm512_unpackhi_epi16(a2, a3); | |||
| a0 = _mm512_permutex2var_epi32(a00, idx_lo, a01); | |||
| a1 = _mm512_permutex2var_epi32(a00, idx_hi, a01); | |||
| a2 = _mm512_permutex2var_epi32(a10, idx_lo, a11); | |||
| a3 = _mm512_permutex2var_epi32(a10, idx_hi, a11); | |||
| _mm512_storeu_si512(boffset0, a0); | |||
| _mm512_storeu_si512(boffset1, a1); | |||
| _mm512_storeu_si512(boffset0 + 32, a2); | |||
| _mm512_storeu_si512(boffset1 + 32, a3); | |||
| boffset0 += 64; | |||
| boffset1 += 64; | |||
| } | |||
| for (; i < m2; i += 2) { | |||
| __m512i a0 = _mm512_loadu_si512(&a[(i + 0)*lda + j]); | |||
| __m512i a1 = _mm512_loadu_si512(&a[(i + 1)*lda + j]); | |||
| __m512i a00 = _mm512_unpacklo_epi16(a0, a1); | |||
| __m512i a01 = _mm512_unpackhi_epi16(a0, a1); | |||
| a0 = _mm512_permutex2var_epi32(a00, idx_lo, a01); | |||
| a1 = _mm512_permutex2var_epi32(a00, idx_hi, a01); | |||
| _mm512_storeu_si512(boffset0, a0); | |||
| _mm512_storeu_si512(boffset1, a1); | |||
| boffset0 += 32; | |||
| boffset1 += 32; | |||
| } | |||
| for (; i < m; i++) { | |||
| /* just copy the only remains row */ | |||
| __m256i a0 = _mm256_loadu_si256((void *)&a[(i + 0)*lda + j]); | |||
| __m256i a1 = _mm256_loadu_si256((void *)&a[(i + 0)*lda + j + 16]); | |||
| _mm256_storeu_si256((void *)boffset0, a0); | |||
| _mm256_storeu_si256((void *)boffset1, a1); | |||
| boffset0 += 16; | |||
| boffset1 += 16; | |||
| } | |||
| boffset0 = boffset1; | |||
| } | |||
| if (j < n) { | |||
| uint32_t remains = n - j; | |||
| __mmask32 r_mask = (1UL << remains) - 1; | |||
| if (remains > 16) { | |||
| boffset1 = boffset0 + m * 16; | |||
| uint32_t tail1 = remains - 16; | |||
| __mmask16 w_mask1 = (1UL << tail1) - 1; | |||
| for (i = 0; i < m2; i += 2) { | |||
| __m512i a0 = _mm512_maskz_loadu_epi16(r_mask, &a[(i + 0)*lda + j]); | |||
| __m512i a1 = _mm512_maskz_loadu_epi16(r_mask, &a[(i + 1)*lda + j]); | |||
| __m512i a00 = _mm512_unpacklo_epi16(a0, a1); | |||
| __m512i a01 = _mm512_unpackhi_epi16(a0, a1); | |||
| a0 = _mm512_permutex2var_epi32(a00, idx_lo, a01); | |||
| a1 = _mm512_permutex2var_epi32(a00, idx_hi, a01); | |||
| _mm512_storeu_si512(boffset0, a0); | |||
| _mm512_mask_storeu_epi32(boffset1, w_mask1, a1); | |||
| boffset0 += 32; | |||
| boffset1 += 2 * tail1; | |||
| } | |||
| for (; i < m; i++) { | |||
| __m256i a0 = _mm256_loadu_si256((void *)&a[(i + 0)*lda + j]); | |||
| __m256i a1 = _mm256_maskz_loadu_epi16(w_mask1, (void *)&a[(i + 0)*lda + j + 16]); | |||
| _mm256_storeu_si256((void *)boffset0, a0); | |||
| _mm256_mask_storeu_epi16((void *)boffset1, w_mask1, a1); | |||
| boffset0 += 16; | |||
| boffset1 += tail1; | |||
| } | |||
| } else { | |||
| __mmask16 w_mask = (1UL << remains ) - 1; | |||
| for (i = 0; i < m2; i += 2) { | |||
| __m512i a0 = _mm512_maskz_loadu_epi16(r_mask, &a[(i + 0)*lda + j]); | |||
| __m512i a1 = _mm512_maskz_loadu_epi16(r_mask, &a[(i + 1)*lda + j]); | |||
| __m512i a00 = _mm512_unpacklo_epi16(a0, a1); | |||
| __m512i a01 = _mm512_unpackhi_epi16(a0, a1); | |||
| a0 = _mm512_permutex2var_epi32(a00, idx_lo, a01); | |||
| _mm512_mask_storeu_epi32(boffset0, w_mask, a0); | |||
| boffset0 += 2 * remains; | |||
| } | |||
| for (; i < m; i++) { | |||
| __m256i a0 = _mm256_maskz_loadu_epi16(w_mask, &a[(i + 0)*lda + j]); | |||
| _mm256_mask_storeu_epi16(boffset0, w_mask, a0); | |||
| boffset0 += remains; | |||
| } | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,216 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2021, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include <stdio.h> | |||
| #include <immintrin.h> | |||
| #include "common.h" | |||
| #define STORE_VEC(Bx, By, vec) \ | |||
| if (By == 0) asm("vmovdqu16 %0, (%1)": : "v"(vec), "r"(boffset##Bx)); \ | |||
| else asm("vmovdqu16 %0, (%1, %2, %c3)": : "v"(vec), "r"(boffset##Bx), "r"(blk_size), "n"(By * 2)); | |||
| int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ | |||
| BLASLONG i, j; | |||
| IFLOAT *boffset0, *boffset1; | |||
| boffset0 = b; | |||
| BLASLONG n24 = n - (n % 24); | |||
| BLASLONG n8 = n & ~7; | |||
| BLASLONG m8 = m & ~7; | |||
| BLASLONG m4 = m & ~3; | |||
| BLASLONG m2 = m & ~1; | |||
| int permute_table[] = { | |||
| 0x0, 0x1, 0x2, 0x3, 0x10, 0x11, 0x12, 0x13, 0x8, 0x9, 0xa, 0xb, 0x18, 0x19, 0x1a, 0x1b, | |||
| 0x4, 0x5, 0x6, 0x7, 0x14, 0x15, 0x16, 0x17, 0xc, 0xd, 0xe, 0xf, 0x1c, 0x1d, 0x1e, 0x1f, | |||
| 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, | |||
| 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, | |||
| }; | |||
| j = 0; | |||
| if (n > 23) { | |||
| /* n = 24 is the max width in current blocking setting */ | |||
| __m512i idx_lo_128 = _mm512_loadu_si512(permute_table); | |||
| __m512i idx_hi_128 = _mm512_loadu_si512(permute_table + 16); | |||
| __m512i idx_lo_256 = _mm512_loadu_si512(permute_table + 32); | |||
| __m512i idx_hi_256 = _mm512_loadu_si512(permute_table + 48); | |||
| __mmask32 mask24 = (1UL << 24) - 1; | |||
| BLASLONG blk_size = m * 4; | |||
| BLASLONG stride = blk_size * 3; | |||
| for (; j < n24; j += 24) { | |||
| boffset1 = boffset0 + stride; | |||
| for (i = 0; i < m8; i += 8) { | |||
| __m512i r0, r1, r2, r3, r4, r5, r6, r7; | |||
| __m512i t0, t1, t2, t3, t4, t5, t6, t7; | |||
| r0 = _mm512_maskz_loadu_epi16(mask24, &a[(i + 0)*lda + j]); | |||
| r1 = _mm512_maskz_loadu_epi16(mask24, &a[(i + 1)*lda + j]); | |||
| r2 = _mm512_maskz_loadu_epi16(mask24, &a[(i + 2)*lda + j]); | |||
| r3 = _mm512_maskz_loadu_epi16(mask24, &a[(i + 3)*lda + j]); | |||
| r4 = _mm512_maskz_loadu_epi16(mask24, &a[(i + 4)*lda + j]); | |||
| r5 = _mm512_maskz_loadu_epi16(mask24, &a[(i + 5)*lda + j]); | |||
| r6 = _mm512_maskz_loadu_epi16(mask24, &a[(i + 6)*lda + j]); | |||
| r7 = _mm512_maskz_loadu_epi16(mask24, &a[(i + 7)*lda + j]); | |||
| t0 = _mm512_unpacklo_epi16(r0, r1); | |||
| t1 = _mm512_unpackhi_epi16(r0, r1); | |||
| t2 = _mm512_unpacklo_epi16(r2, r3); | |||
| t3 = _mm512_unpackhi_epi16(r2, r3); | |||
| t4 = _mm512_unpacklo_epi16(r4, r5); | |||
| t5 = _mm512_unpackhi_epi16(r4, r5); | |||
| t6 = _mm512_unpacklo_epi16(r6, r7); | |||
| t7 = _mm512_unpackhi_epi16(r6, r7); | |||
| r0 = _mm512_permutex2var_epi32(t0, idx_lo_128, t2); | |||
| r1 = _mm512_permutex2var_epi32(t1, idx_lo_128, t3); | |||
| r2 = _mm512_permutex2var_epi32(t4, idx_lo_128, t6); | |||
| r3 = _mm512_permutex2var_epi32(t5, idx_lo_128, t7); | |||
| r4 = _mm512_permutex2var_epi32(t0, idx_hi_128, t2); | |||
| r5 = _mm512_permutex2var_epi32(t1, idx_hi_128, t3); | |||
| r6 = _mm512_permutex2var_epi32(t4, idx_hi_128, t6); | |||
| r7 = _mm512_permutex2var_epi32(t5, idx_hi_128, t7); | |||
| t0 = _mm512_permutex2var_epi32(r0, idx_lo_256, r2); | |||
| t1 = _mm512_permutex2var_epi32(r1, idx_lo_256, r3); | |||
| t2 = _mm512_permutex2var_epi32(r4, idx_lo_256, r6); | |||
| t3 = _mm512_permutex2var_epi32(r5, idx_lo_256, r7); | |||
| t4 = _mm512_permutex2var_epi32(r0, idx_hi_256, r2); | |||
| t5 = _mm512_permutex2var_epi32(r1, idx_hi_256, r3); | |||
| STORE_VEC(0, 0, t0); STORE_VEC(0, 1, t1); STORE_VEC(0, 2, t2); | |||
| STORE_VEC(1, 0, t3); STORE_VEC(1, 1, t4); STORE_VEC(1, 2, t5); | |||
| boffset0 += 32; | |||
| boffset1 += 32; | |||
| } | |||
| for (; i < m2; i += 2) { | |||
| __m512i r0, r1, t0, t1; | |||
| r0 = _mm512_maskz_loadu_epi16(mask24, &a[(i + 0)*lda + j]); | |||
| r1 = _mm512_maskz_loadu_epi16(mask24, &a[(i + 1)*lda + j]); | |||
| t0 = _mm512_unpacklo_epi16(r0, r1); | |||
| t1 = _mm512_unpackhi_epi16(r0, r1); | |||
| STORE_VEC(0, 0, _mm512_extracti32x4_epi32(t0, 0)); | |||
| STORE_VEC(0, 1, _mm512_extracti32x4_epi32(t1, 0)); | |||
| STORE_VEC(0, 2, _mm512_extracti32x4_epi32(t0, 1)); | |||
| STORE_VEC(1, 0, _mm512_extracti32x4_epi32(t1, 1)); | |||
| STORE_VEC(1, 1, _mm512_extracti32x4_epi32(t0, 2)); | |||
| STORE_VEC(1, 2, _mm512_extracti32x4_epi32(t1, 2)); | |||
| boffset0 += 8; | |||
| boffset1 += 8; | |||
| } | |||
| for (; i < m; i++) { | |||
| *(uint64_t *)(boffset0 + blk_size * 0) = *(uint64_t *)&a[i * lda + j + 0]; | |||
| *(uint64_t *)(boffset0 + blk_size * 1) = *(uint64_t *)&a[i * lda + j + 4]; | |||
| *(uint64_t *)(boffset0 + blk_size * 2) = *(uint64_t *)&a[i * lda + j + 8]; | |||
| *(uint64_t *)(boffset1 + blk_size * 0) = *(uint64_t *)&a[i * lda + j + 12]; | |||
| *(uint64_t *)(boffset1 + blk_size * 1) = *(uint64_t *)&a[i * lda + j + 16]; | |||
| *(uint64_t *)(boffset1 + blk_size * 2) = *(uint64_t *)&a[i * lda + j + 20]; | |||
| boffset0 += 4; | |||
| boffset1 += 4; | |||
| } | |||
| boffset0 += stride * 2; | |||
| } | |||
| } | |||
| for (; j < n8; j += 8) { | |||
| boffset1 = boffset0 + m * 4; | |||
| for (i = 0; i < m4; i += 4) { | |||
| __m128i a0 = _mm_loadu_si128((void *)&a[(i + 0)*lda + j]); | |||
| __m128i a1 = _mm_loadu_si128((void *)&a[(i + 1)*lda + j]); | |||
| __m128i a2 = _mm_loadu_si128((void *)&a[(i + 2)*lda + j]); | |||
| __m128i a3 = _mm_loadu_si128((void *)&a[(i + 3)*lda + j]); | |||
| __m128i a00 = _mm_unpacklo_epi16(a0, a1); | |||
| __m128i a01 = _mm_unpackhi_epi16(a0, a1); | |||
| __m128i a10 = _mm_unpacklo_epi16(a2, a3); | |||
| __m128i a11 = _mm_unpackhi_epi16(a2, a3); | |||
| _mm_storeu_si128((void *)(boffset0 + 0), a00); | |||
| _mm_storeu_si128((void *)(boffset0 + 8), a10); | |||
| _mm_storeu_si128((void *)(boffset1 + 0), a01); | |||
| _mm_storeu_si128((void *)(boffset1 + 8), a11); | |||
| boffset0 += 16; | |||
| boffset1 += 16; | |||
| } | |||
| for (; i < m2; i+= 2) { | |||
| __m128i a0 = _mm_loadu_si128((void *)&a[(i + 0)*lda + j]); | |||
| __m128i a1 = _mm_loadu_si128((void *)&a[(i + 1)*lda + j]); | |||
| __m128i a00 = _mm_unpacklo_epi16(a0, a1); | |||
| __m128i a01 = _mm_unpackhi_epi16(a0, a1); | |||
| _mm_storeu_si128((void *)(boffset0 + 0), a00); | |||
| _mm_storeu_si128((void *)(boffset1 + 0), a01); | |||
| boffset0 += 8; | |||
| boffset1 += 8; | |||
| } | |||
| for (; i < m; i++) { | |||
| __m128d a0 = _mm_loadu_pd((void *)&a[(i + 0)*lda + j]); | |||
| _mm_store_sd((void *)boffset0, a0); | |||
| _mm_store_sd((void *)boffset1, _mm_permute_pd(a0, 0x1)); | |||
| boffset0 += 4; | |||
| boffset1 += 4; | |||
| } | |||
| boffset0 = boffset1; | |||
| } | |||
| if (j < n) { | |||
| uint32_t remains = n - j; | |||
| __mmask8 r_mask = (1UL << remains) - 1; | |||
| if (remains > 4) { | |||
| boffset1 = boffset0 + m * 4; | |||
| uint32_t tail1 = remains - 4; | |||
| __mmask8 w_mask1 = (1UL << tail1) - 1; | |||
| for (i = 0; i < m2; i += 2) { | |||
| __m128i a0 = _mm_maskz_loadu_epi16(r_mask, &a[(i + 0)*lda + j]); | |||
| __m128i a1 = _mm_maskz_loadu_epi16(r_mask, &a[(i + 1)*lda + j]); | |||
| __m128i a00 = _mm_unpacklo_epi16(a0, a1); | |||
| __m128i a01 = _mm_unpackhi_epi16(a0, a1); | |||
| _mm_storeu_si128((void *)boffset0, a00); | |||
| _mm_mask_storeu_epi32((void *)boffset1, w_mask1, a01); | |||
| boffset0 += 8; | |||
| boffset1 += 2 * tail1; | |||
| } | |||
| for (; i < m; i++) { | |||
| __m128i a0 = _mm_maskz_loadu_epi16(r_mask, &a[(i + 0)*lda + j]); | |||
| _mm_store_sd((void *)boffset0, (__m128d) a0); | |||
| _mm_mask_storeu_epi16((void *)boffset1, w_mask1, (__m128i) _mm_permute_pd((__m128d) a0, 0x1)); | |||
| boffset0 += 4; | |||
| boffset1 += tail1; | |||
| } | |||
| } else { | |||
| for (i = 0; i < m2; i += 2) { | |||
| __m128i a0 = _mm_maskz_loadu_epi16(r_mask, &a[(i + 0)*lda + j]); | |||
| __m128i a1 = _mm_maskz_loadu_epi16(r_mask, &a[(i + 1)*lda + j]); | |||
| __m128i a00 = _mm_unpacklo_epi16(a0, a1); | |||
| _mm_mask_storeu_epi32((void *)boffset0, r_mask, a00); | |||
| boffset0 += 2 * remains; | |||
| } | |||
| for (; i < m; i++) { | |||
| __m128i a0 = _mm_maskz_loadu_epi16(r_mask, &a[(i + 0)*lda + j]); | |||
| _mm_mask_storeu_epi16((void *)boffset0, r_mask, a0); | |||
| } | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -30,6 +30,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| // Include common macros for BF16 based operations with IA intrinsics | |||
| #include "bf16_common_macros.h" | |||
| #undef STORE16_COMPLETE_RESULT | |||
| #undef STORE16_MASK_COMPLETE_RESULT | |||
| #undef STORE8_COMPLETE_RESULT | |||
| #undef STORE8_MASK_COMPLETE_RESULT | |||
| #undef STORE4_COMPLETE_RESULT | |||
| #undef STORE4_MASK_COMPLETE_RESULT | |||
| #ifndef ZERO_BETA // Beta is non-zero | |||
| #ifndef ONE_BETA // BETA is not ONE | |||
| @@ -103,7 +110,9 @@ static int sbgemv_kernel_32xN_lda_direct(BLASLONG m, BLASLONG n, float alpha, bf | |||
| __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); | |||
| #endif | |||
| #ifndef ZERO_BETA | |||
| #ifndef ONE_BETA | |||
| __m512 BETAVECTOR = _mm512_set1_ps(beta); | |||
| #endif | |||
| #endif | |||
| __m512i matrixArray_seed_0, matrixArray_seed_1, matrixArray_seed_2, matrixArray_seed_3; | |||
| @@ -202,7 +211,7 @@ static int sbgemv_kernel_32xN_lda_direct(BLASLONG m, BLASLONG n, float alpha, bf | |||
| unsigned int tail_mask_value = (((unsigned int)0xffffffff) >> (32-(m&31))); | |||
| __mmask32 tail_mask = *((__mmask32*) &tail_mask_value); | |||
| unsigned short store_tail_mask_value = (((unsigned int)0xffff) >> (16-(m&15))); | |||
| unsigned int store_tail_mask_value = (((unsigned int)0xffff) >> (16-(m&15))); | |||
| __mmask32 store_tail_mask = *((__mmask32*) &store_tail_mask_value); | |||
| accum512_0 = _mm512_setzero_ps(); | |||
| @@ -29,6 +29,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| // Include common macros for BF16 based operations with IA intrinsics | |||
| #include "bf16_common_macros.h" | |||
| #undef STORE16_COMPLETE_RESULT | |||
| #undef STORE16_MASK_COMPLETE_RESULT | |||
| #undef STORE8_COMPLETE_RESULT | |||
| #undef STORE8_MASK_COMPLETE_RESULT | |||
| #undef STORE4_COMPLETE_RESULT | |||
| #undef STORE4_MASK_COMPLETE_RESULT | |||
| #ifndef ZERO_BETA // Beta is non-zero | |||
| #ifndef ONE_BETA // BETA is not ONE | |||
| @@ -231,7 +238,9 @@ static int sbgemv_kernel_32x2(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, | |||
| __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); | |||
| #endif | |||
| #ifndef ZERO_BETA | |||
| #ifndef ONE_BETA | |||
| __m512 BETAVECTOR = _mm512_set1_ps(beta); | |||
| #endif | |||
| #endif | |||
| unsigned char load_mask_value = (((unsigned char)0xff) >> 6); | |||
| @@ -280,7 +289,7 @@ static int sbgemv_kernel_32x2(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, | |||
| } else if (tail_num == 8) { | |||
| __m256 result256 = _mm256_setzero_ps(); | |||
| __m256i matrixArray256 = _mm256_loadu_si256(&a[(tag_m_32x)*2]); // Load 8 rows with n=2 | |||
| __m256i matrixArray256 = _mm256_loadu_si256((__m256i *)&a[(tag_m_32x)*2]); // Load 8 rows with n=2 | |||
| __m256i xArray256 = _mm512_castsi512_si256(xArray); | |||
| result256 = _mm256_dpbf16_ps(result256, (__m256bh) matrixArray256, (__m256bh) xArray256); | |||
| @@ -323,7 +332,9 @@ static int sbgemv_kernel_32x3(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, | |||
| __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); | |||
| #endif | |||
| #ifndef ZERO_BETA | |||
| #ifndef ONE_BETA | |||
| __m512 BETAVECTOR = _mm512_set1_ps(beta); | |||
| #endif | |||
| #endif | |||
| unsigned char x_load_mask_value = (((unsigned char)0xff) >> 5); | |||
| @@ -395,9 +406,9 @@ static int sbgemv_kernel_32x3(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, | |||
| result256_0 = _mm256_setzero_ps(); | |||
| result256_1 = _mm256_setzero_ps(); | |||
| matrixArray256_0 = _mm256_loadu_si256(&a[(tag_m_32x)*3]); // Load 5 rows with n=3 plus 1 element | |||
| matrixArray256_1 = _mm256_loadu_si256(&a[((tag_m_32x+5)*3 + 1)]); // Load 5 rows with n=3 plus 1 element | |||
| matrixArray256_2 = _mm256_loadu_si256(&a[((tag_m_32x+10)*3 + 2)]); // Load 5 rows with n=3 plus 1 element | |||
| matrixArray256_0 = _mm256_loadu_si256((__m256i *)&a[(tag_m_32x)*3]); // Load 5 rows with n=3 plus 1 element | |||
| matrixArray256_1 = _mm256_loadu_si256((__m256i *)&a[((tag_m_32x+5)*3 + 1)]); // Load 5 rows with n=3 plus 1 element | |||
| matrixArray256_2 = _mm256_loadu_si256((__m256i *)&a[((tag_m_32x+10)*3 + 2)]); // Load 5 rows with n=3 plus 1 element | |||
| matrixArray256_3 = _mm256_permutex2var_epi16(matrixArray256_0, load256_idx01_1st, matrixArray256_1); // Select the first 2 elements for each row | |||
| matrixArray256_4 = _mm256_permutex2var_epi16(matrixArray256_1, load256_idx01_2nd, matrixArray256_2); // Select the first 2 elements for each row | |||
| @@ -423,8 +434,8 @@ static int sbgemv_kernel_32x3(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, | |||
| if (tail_num > 10) { | |||
| unsigned short tail_mask_value = (((unsigned short)0xffff) >> (16-((tail_num-10-1)*3+1))); | |||
| __mmask16 tail_mask = *((__mmask16*) &tail_mask_value); | |||
| matrixArray256_0 = _mm256_loadu_si256(&a[(tag_m_32x)*3]); // Load 5 rows with n=3 plus 1 element | |||
| matrixArray256_1 = _mm256_loadu_si256(&a[((tag_m_32x+5)*3 + 1)]); // Load 5 rows with n=3 plus 1 element | |||
| matrixArray256_0 = _mm256_loadu_si256((__m256i *)&a[(tag_m_32x)*3]); // Load 5 rows with n=3 plus 1 element | |||
| matrixArray256_1 = _mm256_loadu_si256((__m256i *)&a[((tag_m_32x+5)*3 + 1)]); // Load 5 rows with n=3 plus 1 element | |||
| matrixArray256_2 = _mm256_maskz_loadu_epi16(tail_mask, &a[((tag_m_32x+10)*3 + 2)]); // Load m-tag_m_32x-10 rows | |||
| matrixArray256_3 = _mm256_permutex2var_epi16(matrixArray256_0, load256_idx01_1st, matrixArray256_1); // Select the first 2 elements for each row | |||
| @@ -439,7 +450,7 @@ static int sbgemv_kernel_32x3(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, | |||
| } else if (tail_num > 5) { | |||
| unsigned short tail_mask_value = (((unsigned short)0xffff) >> (16-((tail_num-5-1)*3+2))); | |||
| __mmask16 tail_mask = *((__mmask16*) &tail_mask_value); | |||
| matrixArray256_0 = _mm256_loadu_si256(&a[(tag_m_32x)*3]); // Load 5 rows with n=3 plus 1 element | |||
| matrixArray256_0 = _mm256_loadu_si256((__m256i *)&a[(tag_m_32x)*3]); // Load 5 rows with n=3 plus 1 element | |||
| matrixArray256_1 = _mm256_maskz_loadu_epi16(tail_mask, &a[((tag_m_32x+5)*3+1)]); // Load m-tag_m_32x-5 rows | |||
| matrixArray256_2 = _mm256_setzero_si256(); | |||
| @@ -499,7 +510,9 @@ static int sbgemv_kernel_16x4(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, | |||
| __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); | |||
| #endif | |||
| #ifndef ZERO_BETA | |||
| #ifndef ONE_BETA | |||
| __m512 BETAVECTOR = _mm512_set1_ps(beta); | |||
| #endif | |||
| #endif | |||
| __m512i M512_EPI32_1 = _mm512_set1_epi32(1); | |||
| @@ -591,7 +604,9 @@ static int sbgemv_kernel_30x5(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, | |||
| __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); | |||
| #endif | |||
| #ifndef ZERO_BETA | |||
| #ifndef ONE_BETA | |||
| __m512 BETAVECTOR = _mm512_set1_ps(beta); | |||
| #endif | |||
| #endif | |||
| __m512 result_0, result_1; | |||
| @@ -782,7 +797,9 @@ static int sbgemv_kernel_16x6(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, | |||
| __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); | |||
| #endif | |||
| #ifndef ZERO_BETA | |||
| #ifndef ONE_BETA | |||
| __m512 BETAVECTOR = _mm512_set1_ps(beta); | |||
| #endif | |||
| #endif | |||
| __m512i M512_EPI32_1 = _mm512_set1_epi32(1); | |||
| @@ -866,9 +883,9 @@ static int sbgemv_kernel_16x6(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, | |||
| result256_0 = _mm256_setzero_ps(); | |||
| matrixArray_0 = _mm256_loadu_si256(&a[(tag_m_16x)*6]); // Load 2 rows with n=6 plus 4 element | |||
| matrixArray_1 = _mm256_loadu_si256(&a[((tag_m_16x+2)*6 + 4)]); // Load 2 rows with n=6 plus 4 element | |||
| matrixArray_2 = _mm256_loadu_si256(&a[((tag_m_16x+5)*6 + 2)]); // Load 2 rows with n=6 plus 4 element | |||
| matrixArray_0 = _mm256_loadu_si256((__m256i *)&a[(tag_m_16x)*6]); // Load 2 rows with n=6 plus 4 element | |||
| matrixArray_1 = _mm256_loadu_si256((__m256i *)&a[((tag_m_16x+2)*6 + 4)]); // Load 2 rows with n=6 plus 4 element | |||
| matrixArray_2 = _mm256_loadu_si256((__m256i *)&a[((tag_m_16x+5)*6 + 2)]); // Load 2 rows with n=6 plus 4 element | |||
| // Process the 0|1 elements | |||
| // Select the 0|1 elements for each row | |||
| @@ -957,7 +974,9 @@ static int sbgemv_kernel_16x7(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, | |||
| __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); | |||
| #endif | |||
| #ifndef ZERO_BETA | |||
| #ifndef ONE_BETA | |||
| __m512 BETAVECTOR = _mm512_set1_ps(beta); | |||
| #endif | |||
| #endif | |||
| __m512i M512_EPI32_2 = _mm512_set1_epi32(2); | |||
| @@ -1110,7 +1129,7 @@ static int sbgemv_kernel_16x8(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, | |||
| { | |||
| BLASLONG tag_m_16x = m & (~15); | |||
| __m128i x128 = _mm_loadu_si128(x); // |x0|x1|x2|x3|x4|x5|x6|x7| | |||
| __m128i x128 = _mm_loadu_si128((__m128i *)x); // |x0|x1|x2|x3|x4|x5|x6|x7| | |||
| if (tag_m_16x > 0) { | |||
| __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3; | |||
| @@ -1122,7 +1141,9 @@ static int sbgemv_kernel_16x8(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, | |||
| __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); | |||
| #endif | |||
| #ifndef ZERO_BETA | |||
| #ifndef ONE_BETA | |||
| __m512 BETAVECTOR = _mm512_set1_ps(beta); | |||
| #endif | |||
| #endif | |||
| __m512i M512_EPI32_2 = _mm512_set1_epi32(2); | |||
| @@ -1214,7 +1235,7 @@ static int sbgemv_kernel_16x8(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, | |||
| __m128 result128, tmp128; | |||
| for (BLASLONG i = tag_m_16x; i < m; i++) { | |||
| result128 = _mm_setzero_ps(); | |||
| matrixArray128 = _mm_loadu_si128(&a[(i)*8]); // Load 1 rows with n=8 | |||
| matrixArray128 = _mm_loadu_si128((__m128i *)&a[(i)*8]); // Load 1 rows with n=8 | |||
| result128 = _mm_dpbf16_ps(result128, (__m128bh) matrixArray128, (__m128bh) x128); | |||
| tmp128 = _mm_shuffle_ps(result128, result128, 14); | |||
| result128 = _mm_add_ps(result128, tmp128); | |||
| @@ -1258,7 +1279,7 @@ static int sbgemv_kernel_14x9(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, | |||
| unsigned char x_load_mask_value = (((unsigned char)0xff) >> 7); | |||
| __mmask8 x_load_mask = *((__mmask8*) &x_load_mask_value); | |||
| __m128i x128_0 = _mm_loadu_si128(x); // |x0|x1|x2|x3|x4|x5|x6|x7| | |||
| __m128i x128_0 = _mm_loadu_si128((__m128i *)x); // |x0|x1|x2|x3|x4|x5|x6|x7| | |||
| __m128i x128_1 = _mm_maskz_loadu_epi16(x_load_mask, (x+8)); // |x8|0 |0 | 0| 0| 0| 0| 0| | |||
| if (tag_m_14x > 0) { | |||
| @@ -1271,7 +1292,9 @@ static int sbgemv_kernel_14x9(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, | |||
| __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); | |||
| #endif | |||
| #ifndef ZERO_BETA | |||
| #ifndef ONE_BETA | |||
| __m512 BETAVECTOR = _mm512_set1_ps(beta); | |||
| #endif | |||
| #endif | |||
| __m256i M256_EPI16_2 = _mm256_set1_epi16(2); | |||
| @@ -1390,7 +1413,7 @@ static int sbgemv_kernel_12x10(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x | |||
| unsigned char x_load_mask_value = (((unsigned char)0xf) >> 3); | |||
| __mmask8 x_load_mask = *((__mmask8*) &x_load_mask_value); | |||
| __m128i x128_0 = _mm_loadu_si128(x); // |x0|x1|x2|x3|x4|x5|x6|x7| | |||
| __m128i x128_0 = _mm_loadu_si128((__m128i *)x); // |x0|x1|x2|x3|x4|x5|x6|x7| | |||
| __m128i x128_1 = _mm_maskz_loadu_epi32(x_load_mask, (x+8)); // |x8|x9|0 | 0| 0| 0| 0| 0| | |||
| if (tag_m_12x > 0) { | |||
| @@ -1403,7 +1426,9 @@ static int sbgemv_kernel_12x10(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x | |||
| __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); | |||
| #endif | |||
| #ifndef ZERO_BETA | |||
| #ifndef ONE_BETA | |||
| __m512 BETAVECTOR = _mm512_set1_ps(beta); | |||
| #endif | |||
| #endif | |||
| __m256i M256_EPI32_1 = _mm256_set1_epi32(1); | |||
| @@ -1522,7 +1547,7 @@ static int sbgemv_kernel_15x11(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x | |||
| unsigned char x_load_mask_value = (((unsigned char)0xff) >> 5); | |||
| __mmask8 x_load_mask = *((__mmask8*) &x_load_mask_value); | |||
| __m128i x128_0 = _mm_loadu_si128(x); // |x0|x1| x2|x3|x4|x5|x6|x7| | |||
| __m128i x128_0 = _mm_loadu_si128((__m128i *)x); // |x0|x1| x2|x3|x4|x5|x6|x7| | |||
| __m128i x128_1 = _mm_maskz_loadu_epi16(x_load_mask, (x+8)); // |x8|x9|x10| 0| 0| 0| 0| 0| | |||
| if (tag_m_15x > 0) { | |||
| @@ -1535,7 +1560,9 @@ static int sbgemv_kernel_15x11(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x | |||
| __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); | |||
| #endif | |||
| #ifndef ZERO_BETA | |||
| #ifndef ONE_BETA | |||
| __m512 BETAVECTOR = _mm512_set1_ps(beta); | |||
| #endif | |||
| #endif | |||
| __m512i idx_stage1_base_0, idx_stage1_base_1, idx_stage1_base_2, idx_stage1_base_3, idx_stage1_base_4, idx_stage1_base_5; | |||
| @@ -1690,7 +1717,7 @@ static int sbgemv_kernel_15x12(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x | |||
| unsigned char x_load_mask_value = (((unsigned char)0xff) >> 4); | |||
| __mmask8 x_load_mask = *((__mmask8*) &x_load_mask_value); | |||
| __m128i x128_0 = _mm_loadu_si128(x); // |x0|x1| x2| x3|x4|x5|x6|x7| | |||
| __m128i x128_0 = _mm_loadu_si128((__m128i *)x); // |x0|x1| x2| x3|x4|x5|x6|x7| | |||
| __m128i x128_1 = _mm_maskz_loadu_epi16(x_load_mask, (x+8)); // |x8|x9|x10|x11| 0| 0| 0| 0| | |||
| if (tag_m_15x > 0) { | |||
| @@ -1703,7 +1730,9 @@ static int sbgemv_kernel_15x12(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x | |||
| __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); | |||
| #endif | |||
| #ifndef ZERO_BETA | |||
| #ifndef ONE_BETA | |||
| __m512 BETAVECTOR = _mm512_set1_ps(beta); | |||
| #endif | |||
| #endif | |||
| __m512i idx_stage1_base_0, idx_stage1_base_1, idx_stage1_base_2, idx_stage1_base_3, idx_stage1_base_4, idx_stage1_base_5; | |||
| @@ -1873,16 +1902,15 @@ static int sbgemv_kernel_16x13(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x | |||
| __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); | |||
| #endif | |||
| #ifndef ZERO_BETA | |||
| #ifndef ONE_BETA | |||
| __m512 BETAVECTOR = _mm512_set1_ps(beta); | |||
| #endif | |||
| #endif | |||
| __m512i M512_EPI32_4 = _mm512_set1_epi32(4); | |||
| __m512i idx_base_0 = _mm512_set_epi32(27, 26, 25, 24, 11, 10, 9, 8, 19, 18, 17, 16, 3, 2, 1, 0); | |||
| __m512i idx_base_1 = _mm512_add_epi32(idx_base_0, M512_EPI32_4); | |||
| unsigned int load_mask_value = (((unsigned int)0xffffffff) >> 6); | |||
| __mmask32 load_mask = *((__mmask32*) &load_mask_value); | |||
| // Prepare X with 2-step interleave way | |||
| xArray_0 = _mm512_inserti32x8(_mm512_castsi256_si512(x256), x256, 0x1); | |||
| BF16_INTERLEAVE_1x32(xArray) | |||
| @@ -2045,7 +2073,9 @@ static int sbgemv_kernel_16x14(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x | |||
| __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); | |||
| #endif | |||
| #ifndef ZERO_BETA | |||
| #ifndef ONE_BETA | |||
| __m512 BETAVECTOR = _mm512_set1_ps(beta); | |||
| #endif | |||
| #endif | |||
| __m512i M512_EPI32_4 = _mm512_set1_epi32(4); | |||
| @@ -2207,16 +2237,15 @@ static int sbgemv_kernel_16x15(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x | |||
| __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); | |||
| #endif | |||
| #ifndef ZERO_BETA | |||
| #ifndef ONE_BETA | |||
| __m512 BETAVECTOR = _mm512_set1_ps(beta); | |||
| #endif | |||
| #endif | |||
| __m512i M512_EPI32_4 = _mm512_set1_epi32(4); | |||
| __m512i idx_base_0 = _mm512_set_epi32(27, 26, 25, 24, 11, 10, 9, 8, 19, 18, 17, 16, 3, 2, 1, 0); | |||
| __m512i idx_base_1 = _mm512_add_epi32(idx_base_0, M512_EPI32_4); | |||
| unsigned int load_mask_value = (((unsigned int)0xffffffff) >> 2); | |||
| __mmask32 load_mask = *((__mmask32*) &load_mask_value); | |||
| // Prepare X with 2-step interleave way | |||
| xArray_0 = _mm512_inserti32x8(_mm512_castsi256_si512(x256), x256, 0x1); | |||
| BF16_INTERLEAVE_1x32(xArray) | |||
| @@ -2364,7 +2393,7 @@ static int sbgemv_kernel_16x16(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x | |||
| { | |||
| BLASLONG tag_m_16x = m & (~15); | |||
| __m256i x256 = _mm256_loadu_si256(x); // |x0|x1|x2|x3|x4|x5|x6|x7|x8|x9|x10|x11|x12|x13|x14|x15| | |||
| __m256i x256 = _mm256_loadu_si256((__m256i *)x); // |x0|x1|x2|x3|x4|x5|x6|x7|x8|x9|x10|x11|x12|x13|x14|x15| | |||
| if (tag_m_16x > 0) { | |||
| __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5, matrixArray_6, matrixArray_7, \ | |||
| @@ -2377,7 +2406,9 @@ static int sbgemv_kernel_16x16(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x | |||
| __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); | |||
| #endif | |||
| #ifndef ZERO_BETA | |||
| #ifndef ONE_BETA | |||
| __m512 BETAVECTOR = _mm512_set1_ps(beta); | |||
| #endif | |||
| #endif | |||
| __m512i M512_EPI32_4 = _mm512_set1_epi32(4); | |||
| @@ -2484,7 +2515,7 @@ static int sbgemv_kernel_16x16(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x | |||
| __m128 accum128, tmp128; | |||
| for (BLASLONG i = tag_m_16x; i < m; i++) { | |||
| accum256 = _mm256_setzero_ps(); | |||
| matrixArray256 = _mm256_loadu_si256(&a[(i)*16]); // Load 1 rows with n=16 | |||
| matrixArray256 = _mm256_loadu_si256((__m256i *)&a[(i)*16]); // Load 1 rows with n=16 | |||
| accum256 = _mm256_dpbf16_ps(accum256, (__m256bh) matrixArray256, (__m256bh) x256); | |||
| accum128 = _mm_add_ps(_mm256_castps256_ps128(accum256), _mm256_extractf32x4_ps(accum256, 1)); | |||
| tmp128 = _mm_shuffle_ps(accum128, accum128, 0x0e); | |||
| @@ -2535,7 +2566,9 @@ static int sbgemv_kernel_8x16p_lda(BLASLONG m, BLASLONG n, float alpha, bfloat16 | |||
| __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); | |||
| #endif | |||
| #ifndef ZERO_BETA | |||
| #ifndef ONE_BETA | |||
| __m512 BETAVECTOR = _mm512_set1_ps(beta); | |||
| #endif | |||
| #endif | |||
| __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5, matrixArray_6, matrixArray_7, \ | |||
| @@ -2647,8 +2680,6 @@ static int sbgemv_kernel_1x128_lda_direct(BLASLONG m, BLASLONG n, float alpha, b | |||
| BLASLONG tag_n_32x = n & (~31); | |||
| BLASLONG tag_n_128x = n & (~127); | |||
| __m512 accum512_0, accum512_1, accum512_2, accum512_3, accum512_4, accum512_5, accum512_6, accum512_7, \ | |||
| accum512_8, accum512_9, accum512_10, accum512_11, accum512_12, accum512_13, accum512_14, accum512_15; | |||
| __m512 accum512_bridge[8]; | |||
| __m512 accum512_t_0, accum512_t_1, accum512_t_2, accum512_t_3; | |||
| __m256 accum256_0; | |||
| @@ -2658,7 +2689,9 @@ static int sbgemv_kernel_1x128_lda_direct(BLASLONG m, BLASLONG n, float alpha, b | |||
| __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); | |||
| #endif | |||
| #ifndef ZERO_BETA | |||
| #ifndef ONE_BETA | |||
| __m512 BETAVECTOR = _mm512_set1_ps(beta); | |||
| #endif | |||
| #endif | |||
| __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3; | |||
| @@ -2825,7 +2858,9 @@ static int sbgemv_kernel_8x32_lda_direct(BLASLONG m, BLASLONG n, float alpha, bf | |||
| __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); | |||
| #endif | |||
| #ifndef ZERO_BETA | |||
| #ifndef ONE_BETA | |||
| __m512 BETAVECTOR = _mm512_set1_ps(beta); | |||
| #endif | |||
| #endif | |||
| __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5, matrixArray_6, matrixArray_7; | |||
| @@ -2961,7 +2996,9 @@ static int sbgemv_kernel_8x16m_lda(BLASLONG m, BLASLONG n, float alpha, bfloat16 | |||
| __m512 ALPHAVECTOR = _mm512_castps256_ps512(_mm256_set1_ps(alpha)); | |||
| #endif | |||
| #ifndef ZERO_BETA | |||
| #ifndef ONE_BETA | |||
| __m512 BETAVECTOR = _mm512_castps256_ps512(_mm256_set1_ps(beta)); | |||
| #endif | |||
| #endif | |||
| __m256 accum256_0, accum256_1, accum256_2, accum256_3, accum256_4, accum256_5, accum256_6, accum256_7, \ | |||
| @@ -3012,7 +3049,7 @@ static int sbgemv_kernel_8x16m_lda(BLASLONG m, BLASLONG n, float alpha, bfloat16 | |||
| __m128 accum128, tmp128; | |||
| for (BLASLONG i = tag_m_8x; i < m; i++) { | |||
| accum256_0 = _mm256_setzero_ps(); | |||
| matrixArray_0 = _mm256_loadu_si256(&a[(i)*lda]); // Load 1 rows with n=16 | |||
| matrixArray_0 = _mm256_loadu_si256((__m256i *)&a[(i)*lda]); // Load 1 rows with n=16 | |||
| accum256_0 = _mm256_dpbf16_ps(accum256_0, (__m256bh) matrixArray_0, (__m256bh) xArray256); | |||
| accum128 = _mm_add_ps(_mm256_castps256_ps128(accum256_0), _mm256_extractf32x4_ps(accum256_0, 1)); | |||
| tmp128 = _mm_shuffle_ps(accum128, accum128, 0x0e); | |||
| @@ -41,7 +41,7 @@ | |||
| #include <immintrin.h> | |||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta, | |||
| FLOAT *dummy2, BLASLONG dummy3, FLOAT *dummy4, BLASLONG dummy5, | |||
| IFLOAT *dummy2, BLASLONG dummy3, IFLOAT *dummy4, BLASLONG dummy5, | |||
| FLOAT *c, BLASLONG ldc){ | |||
| BLASLONG i, j; | |||
| @@ -115,6 +115,8 @@ static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT | |||
| #endif | |||
| #ifndef HAVE_SGEMV_N_SKYLAKE_KERNEL | |||
| #ifndef HAVE_KERNEL_4x2 | |||
| static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); | |||
| @@ -170,6 +172,7 @@ static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT | |||
| } | |||
| #endif | |||
| #endif | |||
| #ifndef HAVE_KERNEL_4x1 | |||
| @@ -16,7 +16,7 @@ static FLOAT zasum_kernel(BLASLONG n, FLOAT *x) | |||
| if (n2 < 32) { | |||
| __m128d accum_10, accum_11, accum_12, accum_13; | |||
| __m128d abs_mask1; | |||
| __m128d abs_mask1 = abs_mask1; | |||
| accum_10 = _mm_setzero_pd(); | |||
| accum_11 = _mm_setzero_pd(); | |||
| @@ -351,7 +351,7 @@ | |||
| * | |||
| * Quick return if possible | |||
| * | |||
| IF( N.LE.0 ) THEN | |||
| IF( (N.LE.0) .OR. (M.LE.0) ) THEN | |||
| RETURN | |||
| END IF | |||
| * | |||
| @@ -353,7 +353,7 @@ | |||
| * | |||
| * Quick return if possible | |||
| * | |||
| IF( N.LE.0 ) THEN | |||
| IF( (N.LE.0).OR.(M.LE.0) ) THEN | |||
| RETURN | |||
| END IF | |||
| * | |||
| @@ -353,7 +353,7 @@ | |||
| * | |||
| * Quick return if possible | |||
| * | |||
| IF( N.LE.0 ) THEN | |||
| IF( (N.LE.0).OR.(M.LE.0) ) THEN | |||
| RETURN | |||
| END IF | |||
| * | |||
| @@ -351,7 +351,7 @@ | |||
| * | |||
| * Quick return if possible | |||
| * | |||
| IF( N.LE.0 ) THEN | |||
| IF( (N.LE.0).OR.(M.LE.0) ) THEN | |||
| RETURN | |||
| END IF | |||
| * | |||
| @@ -1771,6 +1771,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| #define USE_SGEMM_KERNEL_DIRECT 1 | |||
| #undef SBGEMM_DEFAULT_UNROLL_N | |||
| #undef SBGEMM_DEFAULT_UNROLL_M | |||
| #undef SBGEMM_DEFAULT_P | |||
| #undef SBGEMM_DEFAULT_R | |||
| #undef SBGEMM_DEFAULT_Q | |||
| #define SBGEMM_DEFAULT_UNROLL_N 4 | |||
| #define SBGEMM_DEFAULT_UNROLL_M 16 | |||
| #define SBGEMM_DEFAULT_P 384 | |||
| #define SBGEMM_DEFAULT_Q 768 | |||
| #define SBGEMM_DEFAULT_R sbgemm_r | |||
| #ifdef ARCH_X86 | |||
| #define SGEMM_DEFAULT_UNROLL_M 4 | |||
| @@ -2454,13 +2465,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define SGEMM_DEFAULT_UNROLL_M 16 | |||
| #define SGEMM_DEFAULT_UNROLL_N 8 | |||
| #if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) | |||
| #define DGEMM_DEFAULT_UNROLL_M 16 | |||
| #define DGEMM_DEFAULT_UNROLL_N 4 | |||
| #else | |||
| #define DGEMM_DEFAULT_UNROLL_M 8 | |||
| #define DGEMM_DEFAULT_UNROLL_N 8 | |||
| #endif | |||
| #define CGEMM_DEFAULT_UNROLL_M 8 | |||
| #define CGEMM_DEFAULT_UNROLL_N 4 | |||
| #define ZGEMM_DEFAULT_UNROLL_M 8 | |||