| @@ -2,6 +2,9 @@ name: continuous build | |||
| on: [push, pull_request] | |||
| permissions: | |||
| contents: read # to fetch code (actions/checkout) | |||
| jobs: | |||
| build: | |||
| runs-on: ${{ matrix.os }} | |||
| @@ -150,6 +153,7 @@ jobs: | |||
| matrix: | |||
| msystem: [MINGW64, MINGW32, CLANG64] | |||
| idx: [int32, int64] | |||
| build-type: [Release] | |||
| include: | |||
| - msystem: MINGW64 | |||
| idx: int32 | |||
| @@ -173,6 +177,11 @@ jobs: | |||
| idx64-flags: -DBINARY=64 -DINTERFACE64=1 | |||
| target-prefix: mingw-w64-clang-x86_64 | |||
| c-lapack-flags: -DC_LAPACK=ON | |||
| - msystem: MINGW64 | |||
| idx: int32 | |||
| target-prefix: mingw-w64-x86_64 | |||
| fc-pkg: mingw-w64-x86_64-gcc-fortran | |||
| build-type: None | |||
| exclude: | |||
| - msystem: MINGW32 | |||
| idx: int64 | |||
| @@ -215,11 +224,11 @@ jobs: | |||
| path: C:/msys64/home/runneradmin/.ccache | |||
| # We include the commit sha in the cache key, as new cache entries are | |||
| # only created if there is no existing entry for the key yet. | |||
| key: ccache-msys2-${{ matrix.msystem }}-${{ matrix.idx }}-${{ github.ref }}-${{ github.sha }} | |||
| key: ccache-msys2-${{ matrix.msystem }}-${{ matrix.idx }}-${{ matrix.build-type }}-${{ github.ref }}-${{ github.sha }} | |||
| # Restore a matching ccache cache entry. Prefer same branch. | |||
| restore-keys: | | |||
| ccache-msys2-${{ matrix.msystem }}-${{ matrix.idx }}-${{ github.ref }} | |||
| ccache-msys2-${{ matrix.msystem }}-${{ matrix.idx }} | |||
| ccache-msys2-${{ matrix.msystem }}-${{ matrix.idx }}-${{ matrix.build-type }}-${{ github.ref }} | |||
| ccache-msys2-${{ matrix.msystem }}-${{ matrix.idx }}-${{ matrix.build-type }} | |||
| - name: Configure ccache | |||
| # Limit the maximum size and switch on compression to avoid exceeding the total disk or cache quota. | |||
| @@ -235,7 +244,8 @@ jobs: | |||
| - name: Configure OpenBLAS | |||
| run: | | |||
| mkdir build && cd build | |||
| cmake -DBUILD_SHARED_LIBS=ON \ | |||
| cmake -DCMAKE_BUILD_TYPE=${{ matrix.build-type }} \ | |||
| -DBUILD_SHARED_LIBS=ON \ | |||
| -DBUILD_STATIC_LIBS=ON \ | |||
| -DDYNAMIC_ARCH=ON \ | |||
| -DUSE_THREAD=ON \ | |||
| @@ -258,6 +268,7 @@ jobs: | |||
| timeout-minutes: 60 | |||
| run: cd build && ctest | |||
| cross_build: | |||
| runs-on: ubuntu-22.04 | |||
| @@ -267,7 +278,7 @@ jobs: | |||
| include: | |||
| - target: mips64el | |||
| triple: mips64el-linux-gnuabi64 | |||
| opts: DYNAMIC_ARCH=1 | |||
| opts: DYNAMIC_ARCH=1 TARGET=GENERIC | |||
| - target: riscv64 | |||
| triple: riscv64-linux-gnu | |||
| opts: TARGET=RISCV64_GENERIC | |||
| @@ -0,0 +1,117 @@ | |||
| # Cross-builds OpenBLAS for several MIPS64 targets and runs its test programs | |||
| # under a qemu-mips64el user-mode emulator that is built in the same job. | |||
| name: mips64 qemu test | |||
| on: [push, pull_request] | |||
| permissions: | |||
| contents: read # to fetch code (actions/checkout) | |||
| jobs: | |||
| TEST: | |||
| runs-on: ubuntu-latest | |||
| strategy: | |||
| # Let every matrix entry run to completion even if another entry fails. | |||
| fail-fast: false | |||
| matrix: | |||
| # Each entry supplies: target (part of the ccache key), triple (prefix of the | |||
| # cross gcc/gfortran packages and binaries) and opts (extra make arguments). | |||
| include: | |||
| - target: MIPS64_GENERIC | |||
| triple: mips64el-linux-gnuabi64 | |||
| opts: NO_SHARED=1 TARGET=MIPS64_GENERIC | |||
| - target: SICORTEX | |||
| triple: mips64el-linux-gnuabi64 | |||
| opts: NO_SHARED=1 TARGET=SICORTEX | |||
| - target: I6400 | |||
| triple: mipsisa64r6el-linux-gnuabi64 | |||
| opts: NO_SHARED=1 TARGET=I6400 | |||
| - target: P6600 | |||
| triple: mipsisa64r6el-linux-gnuabi64 | |||
| opts: NO_SHARED=1 TARGET=P6600 | |||
| - target: I6500 | |||
| triple: mipsisa64r6el-linux-gnuabi64 | |||
| opts: NO_SHARED=1 TARGET=I6500 | |||
| steps: | |||
| - name: Checkout repository | |||
| uses: actions/checkout@v3 | |||
| # Host build tools plus the cross C and Fortran compilers for the matrix triple. | |||
| - name: install build deps | |||
| run: | | |||
| sudo apt-get update | |||
| sudo apt-get install autoconf automake autotools-dev ninja-build make ccache \ | |||
| gcc-${{ matrix.triple }} gfortran-${{ matrix.triple }} libgomp1-mips64el-cross | |||
| # Fetch the qemu sources at a pinned commit into ./qemu. | |||
| - name: checkout qemu | |||
| uses: actions/checkout@v3 | |||
| with: | |||
| repository: qemu/qemu | |||
| path: qemu | |||
| ref: 79dfa177ae348bb5ab5f97c0915359b13d6186e2 | |||
| # Build only the mips64el user-mode emulator and install it under the workspace. | |||
| - name: build qemu | |||
| run: | | |||
| cd qemu | |||
| ./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=mips64el-linux-user --disable-system | |||
| make -j$(nproc) | |||
| make install | |||
| # Cache ~/.ccache per target. The key ends in the commit sha; restore-keys fall | |||
| # back to the same ref first and then to any ref for this target. | |||
| - name: Compilation cache | |||
| uses: actions/cache@v3 | |||
| with: | |||
| path: ~/.ccache | |||
| key: ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }}-${{ github.sha }} | |||
| restore-keys: | | |||
| ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }} | |||
| ccache-${{ runner.os }}-${{ matrix.target }} | |||
| # Create the cache directory if missing, cap it at 300M with compression, print stats. | |||
| - name: Configure ccache | |||
| run: | | |||
| test -d ~/.ccache || mkdir -p ~/.ccache | |||
| echo "max_size = 300M" > ~/.ccache/ccache.conf | |||
| echo "compression = true" >> ~/.ccache/ccache.conf | |||
| ccache -s | |||
| # Cross compilers are wrapped in ccache and link with -static; HOSTCC is the native gcc. | |||
| - name: build OpenBLAS | |||
| run: make CC='ccache ${{ matrix.triple }}-gcc -static' FC='ccache ${{ matrix.triple }}-gfortran -static' ${{ matrix.opts }} HOSTCC='ccache gcc' -j$(nproc) | |||
| # Run the programs built under utest/, ctest/ and test/ through the qemu-mips64el | |||
| # binary installed in qemu-install. The test/ programs run once with one thread and | |||
| # once with two; the *.SUMM files are removed before each level 2 and level 3 pass | |||
| # (presumably they are summary files written by those programs - confirm). | |||
| - name: test | |||
| run: | | |||
| export PATH=$GITHUB_WORKSPACE/qemu-install/bin/:$PATH | |||
| qemu-mips64el ./utest/openblas_utest | |||
| OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xscblat1 | |||
| OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xdcblat1 | |||
| OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xccblat1 | |||
| OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xzcblat1 | |||
| OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xscblat2 < ./ctest/sin2 | |||
| OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xdcblat2 < ./ctest/din2 | |||
| OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xccblat2 < ./ctest/cin2 | |||
| OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xzcblat2 < ./ctest/zin2 | |||
| OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xscblat3 < ./ctest/sin3 | |||
| OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xdcblat3 < ./ctest/din3 | |||
| OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xccblat3 < ./ctest/cin3 | |||
| OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xzcblat3 < ./ctest/zin3 | |||
| OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/sblat1 | |||
| OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/dblat1 | |||
| OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/cblat1 | |||
| OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/zblat1 | |||
| OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/sblat1 | |||
| OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/dblat1 | |||
| OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/cblat1 | |||
| OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/zblat1 | |||
| rm -f ./test/?BLAT2.SUMM | |||
| OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/sblat2 < ./test/sblat2.dat | |||
| OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/dblat2 < ./test/dblat2.dat | |||
| OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/cblat2 < ./test/cblat2.dat | |||
| OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/zblat2 < ./test/zblat2.dat | |||
| rm -f ./test/?BLAT2.SUMM | |||
| OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/sblat2 < ./test/sblat2.dat | |||
| OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/dblat2 < ./test/dblat2.dat | |||
| OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/cblat2 < ./test/cblat2.dat | |||
| OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/zblat2 < ./test/zblat2.dat | |||
| rm -f ./test/?BLAT3.SUMM | |||
| OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/sblat3 < ./test/sblat3.dat | |||
| OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/dblat3 < ./test/dblat3.dat | |||
| OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/cblat3 < ./test/cblat3.dat | |||
| OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/zblat3 < ./test/zblat3.dat | |||
| rm -f ./test/?BLAT3.SUMM | |||
| OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/sblat3 < ./test/sblat3.dat | |||
| OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/dblat3 < ./test/dblat3.dat | |||
| OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/cblat3 < ./test/cblat3.dat | |||
| OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/zblat3 < ./test/zblat3.dat | |||
| @@ -17,6 +17,10 @@ on: | |||
| # it only makes sense to test if this file has been changed | |||
| name: Nightly-Homebrew-Build | |||
| permissions: | |||
| contents: read # to fetch code (actions/checkout) | |||
| jobs: | |||
| build-OpenBLAS-with-Homebrew: | |||
| runs-on: macos-latest | |||
| @@ -30,7 +30,7 @@ matrix: | |||
| before_script: &common-before | |||
| - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=POWER8 NUM_THREADS=32" | |||
| script: | |||
| - travis_wait 40 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE | |||
| - travis_wait 50 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE | |||
| - make -C test $COMMON_FLAGS $BTYPE | |||
| - make -C ctest $COMMON_FLAGS $BTYPE | |||
| - make -C utest $COMMON_FLAGS $BTYPE | |||
| @@ -104,7 +104,7 @@ matrix: | |||
| - sudo apt-get update | |||
| - sudo apt-get install gcc-9 gfortran-9 -y | |||
| script: | |||
| - travis_wait 40 make QUIET_MAKE=1 BINARY=64 USE_OPENMP=1 CC=gcc-9 FC=gfortran-9 | |||
| - travis_wait 50 make QUIET_MAKE=1 BINARY=64 USE_OPENMP=1 CC=gcc-9 FC=gfortran-9 | |||
| - make -C test $COMMON_FLAGS $BTYPE | |||
| - make -C ctest $COMMON_FLAGS $BTYPE | |||
| - make -C utest $COMMON_FLAGS $BTYPE | |||
| @@ -121,7 +121,7 @@ matrix: | |||
| - sudo apt-get update | |||
| - sudo apt-get install gcc-9 gfortran-9 -y | |||
| script: | |||
| - travis_wait 40 make QUIET_MAKE=1 BUILD_BFLOAT16=1 BINARY=64 USE_OPENMP=1 CC=gcc-9 FC=gfortran-9 | |||
| - travis_wait 50 make QUIET_MAKE=1 BUILD_BFLOAT16=1 BINARY=64 USE_OPENMP=1 CC=gcc-9 FC=gfortran-9 | |||
| - make -C test $COMMON_FLAGS $BTYPE | |||
| - make -C ctest $COMMON_FLAGS $BTYPE | |||
| - make -C utest $COMMON_FLAGS $BTYPE | |||
| @@ -212,10 +212,10 @@ if(NOT NO_LAPACKE) | |||
| add_library(LAPACKE OBJECT ${LAPACKE_SOURCES}) | |||
| list(APPEND TARGET_OBJS "$<TARGET_OBJECTS:LAPACKE>") | |||
| endif() | |||
| if(BUILD_RELAPACK) | |||
| add_library(RELAPACK OBJECT ${RELA_SOURCES}) | |||
| list(APPEND TARGET_OBJS "$<TARGET_OBJECTS:RELAPACK>") | |||
| endif() | |||
| #if(BUILD_RELAPACK) | |||
| # add_library(RELAPACK OBJECT ${RELA_SOURCES}) | |||
| # list(APPEND TARGET_OBJS "$<TARGET_OBJECTS:RELAPACK>") | |||
| #endif() | |||
| set(OpenBLAS_LIBS "") | |||
| if(BUILD_STATIC_LIBS) | |||
| add_library(${OpenBLAS_LIBNAME}_static STATIC ${TARGET_OBJS} ${OpenBLAS_DEF_FILE}) | |||
| @@ -211,4 +211,5 @@ In chronological order: | |||
| * PLCT Lab, Institute of Software Chinese Academy of Sciences | |||
| * [2022-03] Support RISC-V Vector Intrinsic 1.0 version. | |||
| * Pablo Romero <https://github.com/pablorcum> | |||
| * [2022-08] Fix building from sources for QNX | |||
| @@ -278,7 +278,11 @@ prof_lapack : lapack_prebuild | |||
| lapack_prebuild : | |||
| ifeq ($(NO_LAPACK), $(filter 0,$(NO_LAPACK))) | |||
| -@echo "FC = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc | |||
| ifeq ($(F_COMPILER), GFORTRAN) | |||
| -@echo "override FFLAGS = $(LAPACK_FFLAGS) -fno-tree-vectorize" >> $(NETLIB_LAPACK_DIR)/make.inc | |||
| else | |||
| -@echo "override FFLAGS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc | |||
| endif | |||
| -@echo "FFLAGS_DRV = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc | |||
| -@echo "POPTS = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc | |||
| -@echo "FFLAGS_NOOPT = -O0 $(LAPACK_NOOPT)" >> $(NETLIB_LAPACK_DIR)/make.inc | |||
| @@ -70,12 +70,12 @@ endif | |||
| ifeq ($(CORE), NEOVERSEN1) | |||
| ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG))) | |||
| ifeq ($(GCCVERSIONGTEQ9), 1) | |||
| CCOMMON_OPT += -march=armv8.2-a -mtune=neoverse-n1 | |||
| CCOMMON_OPT += -march=armv8.2-a+sve -mtune=neoverse-n1 | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=armv8.2-a -mtune=neoverse-n1 | |||
| endif | |||
| else | |||
| CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 | |||
| CCOMMON_OPT += -march=armv8.2-a+sve -mtune=cortex-a72 | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 | |||
| endif | |||
| @@ -89,17 +89,17 @@ endif | |||
| endif | |||
| # Use a72 tunings because Neoverse-V1 is only available | |||
| # in GCC>=9.4 | |||
| # in GCC>=10.4 | |||
| ifeq ($(CORE), NEOVERSEV1) | |||
| ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG))) | |||
| ifeq ($(GCCVERSIONGTEQ9), 1) | |||
| ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ10))) | |||
| CCOMMON_OPT += -march=armv8.4-a -mtune=neoverse-v1 | |||
| ifeq ($(GCCVERSIONGTEQ10), 1) | |||
| ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ11))) | |||
| CCOMMON_OPT += -march=armv8.4-a+sve -mtune=neoverse-v1 | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=armv8.4-a -mtune=neoverse-v1 | |||
| endif | |||
| else | |||
| CCOMMON_OPT += -march=armv8.4-a -mtune=native | |||
| CCOMMON_OPT += -march=armv8.4-a+sve -mtune=native | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=armv8.4-a -mtune=native | |||
| endif | |||
| @@ -119,17 +119,21 @@ endif | |||
| endif | |||
| # Use a72 tunings because Neoverse-N2 is only available | |||
| # in GCC>=9.4 | |||
| # in GCC>=10.4 | |||
| ifeq ($(CORE), NEOVERSEN2) | |||
| ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG))) | |||
| ifeq ($(GCCVERSIONGTEQ9), 1) | |||
| ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ10))) | |||
| ifeq ($(GCCVERSIONGTEQ10), 1) | |||
| ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ11))) | |||
| ifneq ($(OSNAME), Darwin) | |||
| CCOMMON_OPT += -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2 | |||
| else | |||
| CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 | |||
| endif | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2 | |||
| endif | |||
| else | |||
| CCOMMON_OPT += -march=armv8.5-a -mtune=native | |||
| CCOMMON_OPT += -march=armv8.5-a+sve -mtune=native | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=armv8.5-a -mtune=native | |||
| endif | |||
| @@ -14,6 +14,11 @@ OPENBLAS_CMAKE_CONFIG := OpenBLASConfig.cmake | |||
| OPENBLAS_CMAKE_CONFIG_VERSION := OpenBLASConfigVersion.cmake | |||
| OPENBLAS_PKGCONFIG_DIR := $(OPENBLAS_LIBRARY_DIR)/pkgconfig | |||
| PKG_EXTRALIB := $(EXTRALIB) | |||
| ifeq ($(INTERFACE64),1) | |||
| SUFFIX64=64 | |||
| endif | |||
| PKGFILE="$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE)$(SUFFIX64).pc" | |||
| ifeq ($(USE_OPENMP), 1) | |||
| ifeq ($(C_COMPILER), PGI) | |||
| PKG_EXTRALIB += -lomp | |||
| @@ -150,13 +155,19 @@ endif | |||
| endif | |||
| #Generating openblas.pc | |||
| @echo Generating $(LIBSONAMEBASE).pc in "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)" | |||
| @echo 'libdir='$(OPENBLAS_LIBRARY_DIR) > "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE).pc" | |||
| @echo 'includedir='$(OPENBLAS_INCLUDE_DIR) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE).pc" | |||
| @echo 'openblas_config= USE_64BITINT='$(USE_64BITINT) 'DYNAMIC_ARCH='$(DYNAMIC_ARCH) 'DYNAMIC_OLDER='$(DYNAMIC_OLDER) 'NO_CBLAS='$(NO_CBLAS) 'NO_LAPACK='$(NO_LAPACK) 'NO_LAPACKE='$(NO_LAPACKE) 'NO_AFFINITY='$(NO_AFFINITY) 'USE_OPENMP='$(USE_OPENMP) $(CORE) 'MAX_THREADS='$(NUM_THREADS)>> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE).pc" | |||
| @echo 'version='$(VERSION) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE).pc" | |||
| @echo 'extralib='$(PKG_EXTRALIB) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE).pc" | |||
| @cat openblas.pc.in >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE).pc" | |||
| ifeq ($(INTERFACE64),1) | |||
| SUFFIX64=64 | |||
| endif | |||
| PKGFILE="$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE)$(SUFFIX64).pc" | |||
| @echo Generating $(LIBSONAMEBASE)$(SUFFIX64).pc in "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)" | |||
| @echo 'libdir='$(OPENBLAS_LIBRARY_DIR) > "$(PKGFILE)" | |||
| @echo 'libsuffix='$(SYMBOLSUFFIX) >> "$(PKGFILE)" | |||
| @echo 'includedir='$(OPENBLAS_INCLUDE_DIR) >> "$(PKGFILE)" | |||
| @echo 'openblas_config= USE_64BITINT='$(INTERFACE64) 'DYNAMIC_ARCH='$(DYNAMIC_ARCH) 'DYNAMIC_OLDER='$(DYNAMIC_OLDER) 'NO_CBLAS='$(NO_CBLAS) 'NO_LAPACK='$(NO_LAPACK) 'NO_LAPACKE='$(NO_LAPACKE) 'NO_AFFINITY='$(NO_AFFINITY) 'USE_OPENMP='$(USE_OPENMP) $(CORE) 'MAX_THREADS='$(NUM_THREADS)>> "$(PKGFILE)" | |||
| @echo 'version='$(VERSION) >> "$(PKGFILE)" | |||
| @echo 'extralib='$(PKG_EXTRALIB) >> "$(PKGFILE)" | |||
| @cat openblas.pc.in >> "$(PKGFILE)" | |||
| #Generating OpenBLASConfig.cmake | |||
| @@ -60,9 +60,9 @@ all: getarch_2nd | |||
| ./getarch_2nd 1 >> $(TARGET_CONF) | |||
| $(TARGET_CONF): c_check$(SCRIPTSUFFIX) f_check$(SCRIPTSUFFIX) getarch | |||
| ./c_check$(SCRIPTSUFFIX) $(TARGET_MAKE) $(TARGET_CONF) "$(CC)" "$(TARGET_FLAGS) $(CFLAGS)" | |||
| ./c_check$(SCRIPTSUFFIX) $(TARGET_MAKE) $(TARGET_CONF) "$(CC)" $(TARGET_FLAGS) $(CFLAGS) | |||
| ifneq ($(ONLY_CBLAS), 1) | |||
| ./f_check$(SCRIPTSUFFIX) $(TARGET_MAKE) $(TARGET_CONF) "$(FC)" "$(TARGET_FLAGS)" | |||
| ./f_check$(SCRIPTSUFFIX) $(TARGET_MAKE) $(TARGET_CONF) "$(FC)" $(TARGET_FLAGS) | |||
| else | |||
| #When we only build CBLAS, we set NOFORTRAN=2 | |||
| echo "NOFORTRAN=2" >> $(TARGET_MAKE) | |||
| @@ -77,8 +77,8 @@ endif | |||
| getarch : getarch.c cpuid.S dummy $(CPUIDEMU) | |||
| avx512=$$(./c_check$(SCRIPTSUFFIX) - - "$(CC)" "$(TARGET_FLAGS) $(CFLAGS)" | grep NO_AVX512); \ | |||
| rv64gv=$$(./c_check$(SCRIPTSUFFIX) - - "$(CC)" "$(TARGET_FLAGS) $(CFLAGS)" | grep NO_RV64GV); \ | |||
| avx512=$$(./c_check$(SCRIPTSUFFIX) - - "$(CC)" $(TARGET_FLAGS) $(CFLAGS) | grep NO_AVX512); \ | |||
| rv64gv=$$(./c_check$(SCRIPTSUFFIX) - - "$(CC)" $(TARGET_FLAGS) $(CFLAGS) | grep NO_RV64GV); \ | |||
| $(HOSTCC) $(HOST_CFLAGS) $(EXFLAGS) $${avx512:+-D$${avx512}} $${rv64gv:+-D$${rv64gv}} -o $(@F) getarch.c cpuid.S $(CPUIDEMU) | |||
| getarch_2nd : getarch_2nd.c $(TARGET_CONF) dummy | |||
| @@ -131,6 +131,9 @@ BUILD_LAPACK_DEPRECATED = 1 | |||
| # Build RecursiveLAPACK on top of LAPACK | |||
| # BUILD_RELAPACK = 1 | |||
| # Have RecursiveLAPACK actually replace standard LAPACK routines instead of | |||
| # just adding its equivalents with a RELAPACK_ prefix | |||
| # RELAPACK_REPLACE = 1 | |||
| # If you want to use the legacy threaded Level 3 implementation. | |||
| # USE_SIMPLE_THREADED_LEVEL3 = 1 | |||
| @@ -207,7 +210,7 @@ NO_AFFINITY = 1 | |||
| # to the user space. If bigphysarea is enabled, it will use it. | |||
| # DEVICEDRIVER_ALLOCATION = 1 | |||
| # If you need to synchronize FP CSR between threads (for x86/x86_64 only). | |||
| # If you need to synchronize FP CSR between threads (for x86/x86_64 and aarch64 only). | |||
| # CONSISTENT_FPCSR = 1 | |||
| # If any gemm argument m, n or k is less than or equal to this threshold, gemm will be executed | |||
| @@ -9,6 +9,10 @@ ifndef TOPDIR | |||
| TOPDIR = . | |||
| endif | |||
| ifndef RELAPACK_REPLACE | |||
| RELAPACK_REPLACE=0 | |||
| endif | |||
| # we need to use the host system's architecture for getarch compile options, especially when cross-compiling | |||
| HOSTARCH := $(shell uname -m) | |||
| ifeq ($(HOSTARCH), amd64) | |||
| @@ -143,6 +143,7 @@ ifeq ($(C_COMPILER), CLANG) | |||
| CCOMMON_OPT += -mavx2 | |||
| endif | |||
| endif | |||
| ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) | |||
| ifeq ($(F_COMPILER), GFORTRAN) | |||
| # AVX2 support was added in 4.7.0 | |||
| GCCVERSIONGTEQ4 := $(shell expr `$(FC) -dumpversion | cut -f1 -d.` \>= 4) | |||
| @@ -159,6 +160,7 @@ endif | |||
| endif | |||
| endif | |||
| endif | |||
| endif | |||
| endif | |||
| @@ -141,7 +141,7 @@ jobs: | |||
| - job: OSX_OpenMP | |||
| pool: | |||
| vmImage: 'macOS-10.15' | |||
| vmImage: 'macOS-11' | |||
| steps: | |||
| - script: | | |||
| brew update | |||
| @@ -151,15 +151,23 @@ jobs: | |||
| - job: OSX_GCC_Nothreads | |||
| pool: | |||
| vmImage: 'macOS-10.15' | |||
| vmImage: 'macOS-11' | |||
| steps: | |||
| - script: | | |||
| brew update | |||
| make USE_THREADS=0 CC=gcc-10 FC=gfortran-10 | |||
| - job: OSX_GCC12 | |||
| pool: | |||
| vmImage: 'macOS-latest' | |||
| steps: | |||
| - script: | | |||
| brew update | |||
| make CC=gcc-12 FC=gfortran-12 | |||
| - job: OSX_OpenMP_Clang | |||
| pool: | |||
| vmImage: 'macOS-10.15' | |||
| vmImage: 'macOS-11' | |||
| variables: | |||
| LD_LIBRARY_PATH: /usr/local/opt/llvm/lib | |||
| LIBRARY_PATH: /usr/local/opt/llvm/lib | |||
| @@ -172,7 +180,7 @@ jobs: | |||
| - job: OSX_OpenMP_Clang_cmake | |||
| pool: | |||
| vmImage: 'macOS-10.15' | |||
| vmImage: 'macOS-11' | |||
| variables: | |||
| LD_LIBRARY_PATH: /usr/local/opt/llvm/lib | |||
| LIBRARY_PATH: /usr/local/opt/llvm/lib | |||
| @@ -188,7 +196,7 @@ jobs: | |||
| - job: OSX_dynarch_cmake | |||
| pool: | |||
| vmImage: 'macOS-10.15' | |||
| vmImage: 'macOS-11' | |||
| variables: | |||
| LD_LIBRARY_PATH: /usr/local/opt/llvm/lib | |||
| LIBRARY_PATH: /usr/local/opt/llvm/lib | |||
| @@ -196,13 +204,13 @@ jobs: | |||
| - script: | | |||
| mkdir build | |||
| cd build | |||
| cmake -DTARGET=CORE2 -DDYNAMIC_ARCH=1 -DCMAKE_C_COMPILER=gcc-10 -DCMAKE_Fortran_COMPILER=gfortran-10 -DBUILD_SHARED_LIBS=ON .. | |||
| cmake -DTARGET=CORE2 -DDYNAMIC_ARCH=1 -DDYNAMIC_LIST='NEHALEM HASWELL SKYLAKEX' -DCMAKE_C_COMPILER=gcc-10 -DCMAKE_Fortran_COMPILER=gfortran-10 -DBUILD_SHARED_LIBS=ON .. | |||
| cmake --build . | |||
| ctest | |||
| - job: OSX_Ifort_Clang | |||
| pool: | |||
| vmImage: 'macOS-10.15' | |||
| vmImage: 'macOS-11' | |||
| variables: | |||
| LD_LIBRARY_PATH: /usr/local/opt/llvm/lib | |||
| MACOS_HPCKIT_URL: https://registrationcenter-download.intel.com/akdlm/irc_nas/17643/m_HPCKit_p_2021.2.0.2903_offline.dmg | |||
| @@ -235,7 +243,7 @@ jobs: | |||
| - job: OSX_NDK_ARMV7 | |||
| pool: | |||
| vmImage: 'macOS-10.15' | |||
| vmImage: 'macOS-11' | |||
| steps: | |||
| - script: | | |||
| brew update | |||
| @@ -255,7 +263,7 @@ jobs: | |||
| - job: OSX_IOS_ARMV7 | |||
| pool: | |||
| vmImage: 'macOS-10.15' | |||
| vmImage: 'macOS-11' | |||
| variables: | |||
| CC: /Applications/Xcode_12.4.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang | |||
| CFLAGS: -O2 -mno-thumb -Wno-macro-redefined -isysroot /Applications/Xcode_12.4.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS14.4.sdk -arch armv7 -miphoneos-version-min=5.1 | |||
| @@ -1,133 +1,133 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "bench.h" | |||
| /* Bind AMAX to one of four routines, chosen by whether COMPLEX and DOUBLE are defined. */ | |||
| /* Each routine name is passed through the BLASFUNC macro. */ | |||
| #undef AMAX | |||
| #ifdef COMPLEX | |||
| #ifdef DOUBLE | |||
| #define AMAX BLASFUNC(dzamax) | |||
| #else | |||
| #define AMAX BLASFUNC(scamax) | |||
| #endif | |||
| #else | |||
| #ifdef DOUBLE | |||
| #define AMAX BLASFUNC(damax) | |||
| #else | |||
| #define AMAX BLASFUNC(samax) | |||
| #endif | |||
| #endif | |||
| /* Benchmark driver: times the routine bound to AMAX for vector lengths from..to. */ | |||
| /* Usage: prog [from [to [step]]]; the defaults are 1, 200 and 1. */ | |||
| /* OPENBLAS_LOOPS sets the repetitions per length, OPENBLAS_INCX the stride passed to AMAX. */ | |||
| /* Progress and results are written to stderr; returns 0, or exits with 1 if malloc fails. */ | |||
| int main(int argc, char *argv[]) | |||
| { | |||
|     FLOAT *x; | |||
|     blasint m, i; | |||
|     blasint inc_x = 1; | |||
|     int loops = 1; | |||
|     int l; | |||
|     char *p; | |||
|     int from = 1; | |||
|     int to = 200; | |||
|     int step = 1; | |||
|     double timeg; | |||
|     /* Skip the program name, then consume up to three positional arguments. */ | |||
|     argc--; | |||
|     argv++; | |||
|     if (argc > 0) | |||
|     { | |||
|         from = atol(*argv); | |||
|         argc--; | |||
|         argv++; | |||
|     } | |||
|     if (argc > 0) | |||
|     { | |||
|         /* Never let the upper bound fall below the lower bound. */ | |||
|         to = MAX(atol(*argv), from); | |||
|         argc--; | |||
|         argv++; | |||
|     } | |||
|     if (argc > 0) | |||
|     { | |||
|         step = atol(*argv); | |||
|         argc--; | |||
|         argv++; | |||
|     } | |||
|     if ((p = getenv("OPENBLAS_LOOPS"))) | |||
|         loops = atoi(p); | |||
|     if ((p = getenv("OPENBLAS_INCX"))) | |||
|         inc_x = atoi(p); | |||
|     /* A step below 1 would never advance m, so the size loop would not terminate. */ | |||
|     if (step < 1) | |||
|         step = 1; | |||
|     /* The elapsed time is averaged over loops; keep the divisor positive. */ | |||
|     if (loops < 1) | |||
|         loops = 1; | |||
|     /* inc_x is a blasint, which may be wider than int; cast it to match %d. */ | |||
|     fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step, (int)inc_x, loops); | |||
|     /* One buffer sized for the largest length is reused for every m. */ | |||
|     if ((x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL) | |||
|     { | |||
|         fprintf(stderr, "Out of Memory!!\n"); | |||
|         exit(1); | |||
|     } | |||
| #ifdef __linux | |||
|     srandom(getpid()); | |||
| #endif | |||
|     fprintf(stderr, " SIZE Flops\n"); | |||
|     for (m = from; m <= to; m += step) | |||
|     { | |||
|         timeg = 0; | |||
|         fprintf(stderr, " %6d : ", (int)m); | |||
|         for (l = 0; l < loops; l++) | |||
|         { | |||
|             /* Refill the input with values in [-0.5, 0.5] before each timed call. */ | |||
|             for (i = 0; i < m * COMPSIZE * abs(inc_x); i++) | |||
|             { | |||
|                 x[i] = ((FLOAT)rand() / (FLOAT)RAND_MAX) - 0.5; | |||
|             } | |||
|             begin(); | |||
|             AMAX(&m, x, &inc_x); | |||
|             end(); | |||
|             timeg += getsec(); | |||
|         } | |||
|         timeg /= loops; | |||
|         fprintf(stderr, | |||
|                 " %10.2f MFlops %10.6f sec\n", | |||
|                 COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg); | |||
|     } | |||
|     free(x); | |||
|     return 0; | |||
| } | |||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| /*************************************************************************** | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "bench.h" | |||
| /* Bind AMAX to one of four routines, chosen by whether COMPLEX and DOUBLE are defined. */ | |||
| /* Each routine name is passed through the BLASFUNC macro. */ | |||
| #undef AMAX | |||
| #ifdef COMPLEX | |||
| #ifdef DOUBLE | |||
| #define AMAX BLASFUNC(dzamax) | |||
| #else | |||
| #define AMAX BLASFUNC(scamax) | |||
| #endif | |||
| #else | |||
| #ifdef DOUBLE | |||
| #define AMAX BLASFUNC(damax) | |||
| #else | |||
| #define AMAX BLASFUNC(samax) | |||
| #endif | |||
| #endif | |||
| /* Benchmark driver: times the routine bound to AMAX for vector lengths from..to. */ | |||
| /* Usage: prog [from [to [step]]]; the defaults are 1, 200 and 1. */ | |||
| /* OPENBLAS_LOOPS sets the repetitions per length, OPENBLAS_INCX the stride passed to AMAX. */ | |||
| /* Progress and results are written to stderr; returns 0, or exits with 1 if malloc fails. */ | |||
| int main(int argc, char *argv[]) | |||
| { | |||
|     FLOAT *x; | |||
|     blasint m, i; | |||
|     blasint inc_x = 1; | |||
|     int loops = 1; | |||
|     int l; | |||
|     char *p; | |||
|     int from = 1; | |||
|     int to = 200; | |||
|     int step = 1; | |||
|     double timeg; | |||
|     /* Skip the program name, then consume up to three positional arguments. */ | |||
|     argc--; | |||
|     argv++; | |||
|     if (argc > 0) | |||
|     { | |||
|         from = atol(*argv); | |||
|         argc--; | |||
|         argv++; | |||
|     } | |||
|     if (argc > 0) | |||
|     { | |||
|         /* Never let the upper bound fall below the lower bound. */ | |||
|         to = MAX(atol(*argv), from); | |||
|         argc--; | |||
|         argv++; | |||
|     } | |||
|     if (argc > 0) | |||
|     { | |||
|         step = atol(*argv); | |||
|         argc--; | |||
|         argv++; | |||
|     } | |||
|     if ((p = getenv("OPENBLAS_LOOPS"))) | |||
|         loops = atoi(p); | |||
|     if ((p = getenv("OPENBLAS_INCX"))) | |||
|         inc_x = atoi(p); | |||
|     /* A step below 1 would never advance m, so the size loop would not terminate. */ | |||
|     if (step < 1) | |||
|         step = 1; | |||
|     /* The elapsed time is averaged over loops; keep the divisor positive. */ | |||
|     if (loops < 1) | |||
|         loops = 1; | |||
|     /* inc_x is a blasint, which may be wider than int; cast it to match %d. */ | |||
|     fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step, (int)inc_x, loops); | |||
|     /* One buffer sized for the largest length is reused for every m. */ | |||
|     if ((x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL) | |||
|     { | |||
|         fprintf(stderr, "Out of Memory!!\n"); | |||
|         exit(1); | |||
|     } | |||
| #ifdef __linux | |||
|     srandom(getpid()); | |||
| #endif | |||
|     fprintf(stderr, " SIZE Flops\n"); | |||
|     for (m = from; m <= to; m += step) | |||
|     { | |||
|         timeg = 0; | |||
|         fprintf(stderr, " %6d : ", (int)m); | |||
|         for (l = 0; l < loops; l++) | |||
|         { | |||
|             /* Refill the input with values in [-0.5, 0.5] before each timed call. */ | |||
|             for (i = 0; i < m * COMPSIZE * abs(inc_x); i++) | |||
|             { | |||
|                 x[i] = ((FLOAT)rand() / (FLOAT)RAND_MAX) - 0.5; | |||
|             } | |||
|             begin(); | |||
|             AMAX(&m, x, &inc_x); | |||
|             end(); | |||
|             timeg += getsec(); | |||
|         } | |||
|         timeg /= loops; | |||
|         fprintf(stderr, | |||
|                 " %10.2f MFlops %10.6f sec\n", | |||
|                 COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg); | |||
|     } | |||
|     free(x); | |||
|     return 0; | |||
| } | |||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| @@ -1,137 +1,137 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "bench.h" | |||
| #undef AMIN | |||
| #ifdef COMPLEX | |||
| #ifdef DOUBLE | |||
| #define AMIN BLASFUNC(dzamin) | |||
| #else | |||
| #define AMIN BLASFUNC(scamin) | |||
| #endif | |||
| #else | |||
| #ifdef DOUBLE | |||
| #define AMIN BLASFUNC(damin) | |||
| #else | |||
| #define AMIN BLASFUNC(samin) | |||
| #endif | |||
| #endif | |||
| int main(int argc, char *argv[]) | |||
| { | |||
| FLOAT *x; | |||
| blasint m, i; | |||
| blasint inc_x = 1; | |||
| int loops = 1; | |||
| int l; | |||
| char *p; | |||
| int from = 1; | |||
| int to = 200; | |||
| int step = 1; | |||
| double time1, timeg; | |||
| argc--; | |||
| argv++; | |||
| if (argc > 0) | |||
| { | |||
| from = atol(*argv); | |||
| argc--; | |||
| argv++; | |||
| } | |||
| if (argc > 0) | |||
| { | |||
| to = MAX(atol(*argv), from); | |||
| argc--; | |||
| argv++; | |||
| } | |||
| if (argc > 0) | |||
| { | |||
| step = atol(*argv); | |||
| argc--; | |||
| argv++; | |||
| } | |||
| if ((p = getenv("OPENBLAS_LOOPS"))) | |||
| loops = atoi(p); | |||
| if ((p = getenv("OPENBLAS_INCX"))) | |||
| inc_x = atoi(p); | |||
| fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step, inc_x, loops); | |||
| if ((x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL) | |||
| { | |||
| fprintf(stderr, "Out of Memory!!\n"); | |||
| exit(1); | |||
| } | |||
| #ifdef __linux | |||
| srandom(getpid()); | |||
| #endif | |||
| fprintf(stderr, " SIZE Flops\n"); | |||
| for (m = from; m <= to; m += step) | |||
| { | |||
| timeg = 0; | |||
| fprintf(stderr, " %6d : ", (int)m); | |||
| for (l = 0; l < loops; l++) | |||
| { | |||
| for (i = 0; i < m * COMPSIZE * abs(inc_x); i++) | |||
| { | |||
| x[i] = ((FLOAT)rand() / (FLOAT)RAND_MAX) - 0.5; | |||
| } | |||
| begin(); | |||
| AMIN(&m, x, &inc_x); | |||
| end(); | |||
| timeg += getsec(); | |||
| } | |||
| timeg /= loops; | |||
| fprintf(stderr, | |||
| " %10.2f MFlops %10.6f sec\n", | |||
| COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg); | |||
| } | |||
| return 0; | |||
| } | |||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| /*************************************************************************** | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "bench.h" | |||
| #undef AMIN | |||
| #ifdef COMPLEX | |||
| #ifdef DOUBLE | |||
| #define AMIN BLASFUNC(dzamin) | |||
| #else | |||
| #define AMIN BLASFUNC(scamin) | |||
| #endif | |||
| #else | |||
| #ifdef DOUBLE | |||
| #define AMIN BLASFUNC(damin) | |||
| #else | |||
| #define AMIN BLASFUNC(samin) | |||
| #endif | |||
| #endif | |||
| int main(int argc, char *argv[]) | |||
| { | |||
| FLOAT *x; | |||
| blasint m, i; | |||
| blasint inc_x = 1; | |||
| int loops = 1; | |||
| int l; | |||
| char *p; | |||
| int from = 1; | |||
| int to = 200; | |||
| int step = 1; | |||
| double time1, timeg; | |||
| argc--; | |||
| argv++; | |||
| if (argc > 0) | |||
| { | |||
| from = atol(*argv); | |||
| argc--; | |||
| argv++; | |||
| } | |||
| if (argc > 0) | |||
| { | |||
| to = MAX(atol(*argv), from); | |||
| argc--; | |||
| argv++; | |||
| } | |||
| if (argc > 0) | |||
| { | |||
| step = atol(*argv); | |||
| argc--; | |||
| argv++; | |||
| } | |||
| if ((p = getenv("OPENBLAS_LOOPS"))) | |||
| loops = atoi(p); | |||
| if ((p = getenv("OPENBLAS_INCX"))) | |||
| inc_x = atoi(p); | |||
| fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step, inc_x, loops); | |||
| if ((x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL) | |||
| { | |||
| fprintf(stderr, "Out of Memory!!\n"); | |||
| exit(1); | |||
| } | |||
| #ifdef __linux | |||
| srandom(getpid()); | |||
| #endif | |||
| fprintf(stderr, " SIZE Flops\n"); | |||
| for (m = from; m <= to; m += step) | |||
| { | |||
| timeg = 0; | |||
| fprintf(stderr, " %6d : ", (int)m); | |||
| for (l = 0; l < loops; l++) | |||
| { | |||
| for (i = 0; i < m * COMPSIZE * abs(inc_x); i++) | |||
| { | |||
| x[i] = ((FLOAT)rand() / (FLOAT)RAND_MAX) - 0.5; | |||
| } | |||
| begin(); | |||
| AMIN(&m, x, &inc_x); | |||
| end(); | |||
| timeg += getsec(); | |||
| } | |||
| timeg /= loops; | |||
| fprintf(stderr, | |||
| " %10.2f MFlops %10.6f sec\n", | |||
| COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg); | |||
| } | |||
| return 0; | |||
| } | |||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| @@ -74,6 +74,24 @@ static void *huge_malloc(BLASLONG size){ | |||
| #endif | |||
| /* Benchmarks should allocate with cacheline (often 64 bytes) alignment | |||
| to avoid unreliable results. This technique, storing the allocated | |||
| pointer value just before the aligned memory, doesn't require | |||
| C11's aligned_alloc for compatibility with older compilers. */ | |||
| static void *aligned_alloc_cacheline(size_t n) | |||
| { | |||
| void *p = malloc((size_t)(void *) + n + L1_DATA_LINESIZE - 1); | |||
| if (p) { | |||
| void **newp = (void **) | |||
| (((uintptr_t)p + L1_DATA_LINESIZE) & (uintptr_t)-L1_DATA_LINESIZE); | |||
| newp[-1] = p; | |||
| p = newp; | |||
| } | |||
| return p; | |||
| } | |||
| #define malloc aligned_alloc_cacheline | |||
| #define free(p) free((p) ? ((void **)(p))[-1] : (p)) | |||
| #if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS) | |||
| struct timeval start, stop; | |||
| #elif defined(__APPLE__) | |||
| @@ -1,134 +1,134 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "bench.h" | |||
| #undef HBMV | |||
| #ifdef DOUBLE | |||
| #define HBMV BLASFUNC(zhbmv) | |||
| #else | |||
| #define HBMV BLASFUNC(chbmv) | |||
| #endif | |||
| int main(int argc, char *argv[]){ | |||
| FLOAT *a, *x, *y; | |||
| FLOAT alpha[] = {1.0, 1.0}; | |||
| FLOAT beta [] = {0.0, 0.0}; | |||
| blasint k = 1; | |||
| char uplo='L'; | |||
| blasint m, i, j; | |||
| blasint inc_x=1, inc_y=1; | |||
| int loops = 1; | |||
| int l; | |||
| char *p; | |||
| int from = 1; | |||
| int to = 200; | |||
| int step = 1; | |||
| double time1,timeg; | |||
| argc--;argv++; | |||
| if (argc > 0) { from = atol(*argv); argc--; argv++;} | |||
| if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} | |||
| if (argc > 0) { step = atol(*argv); argc--; argv++;} | |||
| if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); | |||
| if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); | |||
| if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); | |||
| if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p; | |||
| if ((p = getenv("OPENBLAS_K"))) k = atoi(p); | |||
| fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = '%c' k = %d Inc_x = %d Inc_y = %d Loops = %d\n", | |||
| from, to, step, uplo, k, inc_x, inc_y, loops); | |||
| if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL) { | |||
| fprintf(stderr,"Out of Memory!!\n"); | |||
| exit(1); | |||
| } | |||
| if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL) { | |||
| fprintf(stderr,"Out of Memory!!\n"); | |||
| exit(1); | |||
| } | |||
| if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL) { | |||
| fprintf(stderr,"Out of Memory!!\n"); | |||
| exit(1); | |||
| } | |||
| #ifdef __linux | |||
| srandom(getpid()); | |||
| #endif | |||
| fprintf(stderr, " SIZE Flops\n"); | |||
| for(m = from; m <= to; m += step) { | |||
| timeg=0; | |||
| fprintf(stderr, " %6dx%d : ", (int)m, (int)m); | |||
| for(j = 0; j < m; j++) { | |||
| for(i = 0; i < m * COMPSIZE; i++) { | |||
| a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| } | |||
| } | |||
| for (l = 0; l < loops; l++) { | |||
| for (i = 0; i < m * COMPSIZE * abs(inc_x); i++) { | |||
| x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| } | |||
| for (i = 0; i < m * COMPSIZE * abs(inc_y); i++) { | |||
| y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| } | |||
| begin(); | |||
| HBMV (&uplo, &m, &k, alpha, a, &m, x, &inc_x, beta, y, &inc_y ); | |||
| end(); | |||
| timeg += getsec(); | |||
| } | |||
| timeg /= loops; | |||
| fprintf(stderr, " %10.2f MFlops\n", | |||
| COMPSIZE * COMPSIZE * 2. * (double)(2 * k + 1) * (double)m / timeg * 1.e-6); | |||
| } | |||
| return 0; | |||
| } | |||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "bench.h" | |||
| #undef HBMV | |||
| #ifdef DOUBLE | |||
| #define HBMV BLASFUNC(zhbmv) | |||
| #else | |||
| #define HBMV BLASFUNC(chbmv) | |||
| #endif | |||
| int main(int argc, char *argv[]){ | |||
| FLOAT *a, *x, *y; | |||
| FLOAT alpha[] = {1.0, 1.0}; | |||
| FLOAT beta [] = {0.0, 0.0}; | |||
| blasint k = 1; | |||
| char uplo='L'; | |||
| blasint m, i, j; | |||
| blasint inc_x=1, inc_y=1; | |||
| int loops = 1; | |||
| int l; | |||
| char *p; | |||
| int from = 1; | |||
| int to = 200; | |||
| int step = 1; | |||
| double time1,timeg; | |||
| argc--;argv++; | |||
| if (argc > 0) { from = atol(*argv); argc--; argv++;} | |||
| if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} | |||
| if (argc > 0) { step = atol(*argv); argc--; argv++;} | |||
| if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); | |||
| if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); | |||
| if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); | |||
| if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p; | |||
| if ((p = getenv("OPENBLAS_K"))) k = atoi(p); | |||
| fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = '%c' k = %d Inc_x = %d Inc_y = %d Loops = %d\n", | |||
| from, to, step, uplo, k, inc_x, inc_y, loops); | |||
| if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL) { | |||
| fprintf(stderr,"Out of Memory!!\n"); | |||
| exit(1); | |||
| } | |||
| if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL) { | |||
| fprintf(stderr,"Out of Memory!!\n"); | |||
| exit(1); | |||
| } | |||
| if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL) { | |||
| fprintf(stderr,"Out of Memory!!\n"); | |||
| exit(1); | |||
| } | |||
| #ifdef __linux | |||
| srandom(getpid()); | |||
| #endif | |||
| fprintf(stderr, " SIZE Flops\n"); | |||
| for(m = from; m <= to; m += step) { | |||
| timeg=0; | |||
| fprintf(stderr, " %6dx%d : ", (int)m, (int)m); | |||
| for(j = 0; j < m; j++) { | |||
| for(i = 0; i < m * COMPSIZE; i++) { | |||
| a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| } | |||
| } | |||
| for (l = 0; l < loops; l++) { | |||
| for (i = 0; i < m * COMPSIZE * abs(inc_x); i++) { | |||
| x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| } | |||
| for (i = 0; i < m * COMPSIZE * abs(inc_y); i++) { | |||
| y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| } | |||
| begin(); | |||
| HBMV (&uplo, &m, &k, alpha, a, &m, x, &inc_x, beta, y, &inc_y ); | |||
| end(); | |||
| timeg += getsec(); | |||
| } | |||
| timeg /= loops; | |||
| fprintf(stderr, " %10.2f MFlops\n", | |||
| COMPSIZE * COMPSIZE * 2. * (double)(2 * k + 1) * (double)m / timeg * 1.e-6); | |||
| } | |||
| return 0; | |||
| } | |||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| @@ -1,133 +1,133 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "bench.h" | |||
| #undef HPMV | |||
| #ifdef DOUBLE | |||
| #define HPMV BLASFUNC(zhpmv) | |||
| #else | |||
| #define HPMV BLASFUNC(chpmv) | |||
| #endif | |||
| int main(int argc, char *argv[]){ | |||
| FLOAT *a, *x, *y; | |||
| FLOAT alpha[] = {1.0, 1.0}; | |||
| FLOAT beta [] = {1.0, 1.0}; | |||
| char uplo='L'; | |||
| blasint m, i, j; | |||
| blasint inc_x=1, inc_y=1; | |||
| int loops = 1; | |||
| int l; | |||
| char *p; | |||
| int from = 1; | |||
| int to = 200; | |||
| int step = 1; | |||
| double time1,timeg; | |||
| argc--;argv++; | |||
| if (argc > 0) { from = atol(*argv); argc--; argv++;} | |||
| if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} | |||
| if (argc > 0) { step = atol(*argv); argc--; argv++;} | |||
| if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); | |||
| if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); | |||
| if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); | |||
| if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p; | |||
| fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = '%c' Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,uplo,inc_x,inc_y,loops); | |||
| if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL) { | |||
| fprintf(stderr,"Out of Memory!!\n"); | |||
| exit(1); | |||
| } | |||
| if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL) { | |||
| fprintf(stderr,"Out of Memory!!\n"); | |||
| exit(1); | |||
| } | |||
| if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL) { | |||
| fprintf(stderr,"Out of Memory!!\n"); | |||
| exit(1); | |||
| } | |||
| #ifdef __linux | |||
| srandom(getpid()); | |||
| #endif | |||
| fprintf(stderr, " SIZE Flops\n"); | |||
| for(m = from; m <= to; m += step) { | |||
| timeg=0; | |||
| fprintf(stderr, " %6dx%d : ", (int)m, (int)m); | |||
| for(j = 0; j < m; j++) { | |||
| for(i = 0; i < m * COMPSIZE; i++) { | |||
| a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| } | |||
| } | |||
| for (l = 0; l < loops; l++) { | |||
| for (i = 0; i < m * COMPSIZE * abs(inc_x); i++) { | |||
| x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| } | |||
| for (i = 0; i < m * COMPSIZE * abs(inc_y); i++) { | |||
| y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| } | |||
| begin(); | |||
| HPMV (&uplo, &m, alpha, a, x, &inc_x, beta, y, &inc_y ); | |||
| end(); | |||
| time1 = getsec(); | |||
| timeg += time1; | |||
| } | |||
| timeg /= loops; | |||
| fprintf(stderr, " %10.2f MFlops\n", | |||
| COMPSIZE * COMPSIZE * 2. * (double)m * (double)m / timeg * 1.e-6); | |||
| } | |||
| return 0; | |||
| } | |||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "bench.h" | |||
| #undef HPMV | |||
| #ifdef DOUBLE | |||
| #define HPMV BLASFUNC(zhpmv) | |||
| #else | |||
| #define HPMV BLASFUNC(chpmv) | |||
| #endif | |||
| int main(int argc, char *argv[]){ | |||
| FLOAT *a, *x, *y; | |||
| FLOAT alpha[] = {1.0, 1.0}; | |||
| FLOAT beta [] = {1.0, 1.0}; | |||
| char uplo='L'; | |||
| blasint m, i, j; | |||
| blasint inc_x=1, inc_y=1; | |||
| int loops = 1; | |||
| int l; | |||
| char *p; | |||
| int from = 1; | |||
| int to = 200; | |||
| int step = 1; | |||
| double time1,timeg; | |||
| argc--;argv++; | |||
| if (argc > 0) { from = atol(*argv); argc--; argv++;} | |||
| if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} | |||
| if (argc > 0) { step = atol(*argv); argc--; argv++;} | |||
| if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); | |||
| if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); | |||
| if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); | |||
| if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p; | |||
| fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = '%c' Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,uplo,inc_x,inc_y,loops); | |||
| if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL) { | |||
| fprintf(stderr,"Out of Memory!!\n"); | |||
| exit(1); | |||
| } | |||
| if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL) { | |||
| fprintf(stderr,"Out of Memory!!\n"); | |||
| exit(1); | |||
| } | |||
| if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL) { | |||
| fprintf(stderr,"Out of Memory!!\n"); | |||
| exit(1); | |||
| } | |||
| #ifdef __linux | |||
| srandom(getpid()); | |||
| #endif | |||
| fprintf(stderr, " SIZE Flops\n"); | |||
| for(m = from; m <= to; m += step) { | |||
| timeg=0; | |||
| fprintf(stderr, " %6dx%d : ", (int)m, (int)m); | |||
| for(j = 0; j < m; j++) { | |||
| for(i = 0; i < m * COMPSIZE; i++) { | |||
| a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| } | |||
| } | |||
| for (l = 0; l < loops; l++) { | |||
| for (i = 0; i < m * COMPSIZE * abs(inc_x); i++) { | |||
| x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| } | |||
| for (i = 0; i < m * COMPSIZE * abs(inc_y); i++) { | |||
| y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| } | |||
| begin(); | |||
| HPMV (&uplo, &m, alpha, a, x, &inc_x, beta, y, &inc_y ); | |||
| end(); | |||
| time1 = getsec(); | |||
| timeg += time1; | |||
| } | |||
| timeg /= loops; | |||
| fprintf(stderr, " %10.2f MFlops\n", | |||
| COMPSIZE * COMPSIZE * 2. * (double)m * (double)m / timeg * 1.e-6); | |||
| } | |||
| return 0; | |||
| } | |||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| @@ -1,120 +1,120 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "bench.h" | |||
| #undef IAMIN | |||
| #ifdef COMPLEX | |||
| #ifdef DOUBLE | |||
| #define IAMIN BLASFUNC(izamin) | |||
| #else | |||
| #define IAMIN BLASFUNC(icamin) | |||
| #endif | |||
| #else | |||
| #ifdef DOUBLE | |||
| #define IAMIN BLASFUNC(idamin) | |||
| #else | |||
| #define IAMIN BLASFUNC(isamin) | |||
| #endif | |||
| #endif | |||
| int main(int argc, char *argv[]){ | |||
| FLOAT *x; | |||
| blasint m, i; | |||
| blasint inc_x=1; | |||
| int loops = 1; | |||
| int l; | |||
| char *p; | |||
| int from = 1; | |||
| int to = 200; | |||
| int step = 1; | |||
| double time1,timeg; | |||
| argc--;argv++; | |||
| if (argc > 0) { from = atol(*argv); argc--; argv++;} | |||
| if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} | |||
| if (argc > 0) { step = atol(*argv); argc--; argv++;} | |||
| if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); | |||
| if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); | |||
| fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); | |||
| if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ | |||
| fprintf(stderr,"Out of Memory!!\n");exit(1); | |||
| } | |||
| #ifdef __linux | |||
| srandom(getpid()); | |||
| #endif | |||
| fprintf(stderr, " SIZE Flops\n"); | |||
| for(m = from; m <= to; m += step) | |||
| { | |||
| timeg=0; | |||
| fprintf(stderr, " %6d : ", (int)m); | |||
| for (l=0; l<loops; l++) | |||
| { | |||
| for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){ | |||
| x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| } | |||
| begin(); | |||
| IAMIN (&m, x, &inc_x); | |||
| end(); | |||
| time1 = getsec(); | |||
| timeg += time1; | |||
| } | |||
| timeg /= loops; | |||
| fprintf(stderr, | |||
| " %10.2f MFlops %10.6f sec\n", | |||
| COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg); | |||
| } | |||
| return 0; | |||
| } | |||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| /*************************************************************************** | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "bench.h" | |||
| #undef IAMIN | |||
| #ifdef COMPLEX | |||
| #ifdef DOUBLE | |||
| #define IAMIN BLASFUNC(izamin) | |||
| #else | |||
| #define IAMIN BLASFUNC(icamin) | |||
| #endif | |||
| #else | |||
| #ifdef DOUBLE | |||
| #define IAMIN BLASFUNC(idamin) | |||
| #else | |||
| #define IAMIN BLASFUNC(isamin) | |||
| #endif | |||
| #endif | |||
| int main(int argc, char *argv[]){ | |||
| FLOAT *x; | |||
| blasint m, i; | |||
| blasint inc_x=1; | |||
| int loops = 1; | |||
| int l; | |||
| char *p; | |||
| int from = 1; | |||
| int to = 200; | |||
| int step = 1; | |||
| double time1,timeg; | |||
| argc--;argv++; | |||
| if (argc > 0) { from = atol(*argv); argc--; argv++;} | |||
| if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} | |||
| if (argc > 0) { step = atol(*argv); argc--; argv++;} | |||
| if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); | |||
| if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); | |||
| fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); | |||
| if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ | |||
| fprintf(stderr,"Out of Memory!!\n");exit(1); | |||
| } | |||
| #ifdef __linux | |||
| srandom(getpid()); | |||
| #endif | |||
| fprintf(stderr, " SIZE Flops\n"); | |||
| for(m = from; m <= to; m += step) | |||
| { | |||
| timeg=0; | |||
| fprintf(stderr, " %6d : ", (int)m); | |||
| for (l=0; l<loops; l++) | |||
| { | |||
| for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){ | |||
| x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| } | |||
| begin(); | |||
| IAMIN (&m, x, &inc_x); | |||
| end(); | |||
| time1 = getsec(); | |||
| timeg += time1; | |||
| } | |||
| timeg /= loops; | |||
| fprintf(stderr, | |||
| " %10.2f MFlops %10.6f sec\n", | |||
| COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg); | |||
| } | |||
| return 0; | |||
| } | |||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| @@ -1,114 +1,114 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "bench.h" | |||
| #undef IMAX | |||
| #ifndef COMPLEX | |||
| #ifdef DOUBLE | |||
| #define IMAX BLASFUNC(idmax) | |||
| #else | |||
| #define IMAX BLASFUNC(ismax) | |||
| #endif | |||
| #endif | |||
| int main(int argc, char *argv[]){ | |||
| FLOAT *x; | |||
| blasint m, i; | |||
| blasint inc_x=1; | |||
| int loops = 1; | |||
| int l; | |||
| char *p; | |||
| int from = 1; | |||
| int to = 200; | |||
| int step = 1; | |||
| double time1,timeg; | |||
| argc--;argv++; | |||
| if (argc > 0) { from = atol(*argv); argc--; argv++;} | |||
| if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} | |||
| if (argc > 0) { step = atol(*argv); argc--; argv++;} | |||
| if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); | |||
| if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); | |||
| fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); | |||
| if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ | |||
| fprintf(stderr,"Out of Memory!!\n");exit(1); | |||
| } | |||
| #ifdef __linux | |||
| srandom(getpid()); | |||
| #endif | |||
| fprintf(stderr, " SIZE Flops\n"); | |||
| for(m = from; m <= to; m += step) | |||
| { | |||
| timeg=0; | |||
| fprintf(stderr, " %6d : ", (int)m); | |||
| for (l=0; l<loops; l++) | |||
| { | |||
| for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){ | |||
| x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| } | |||
| begin(); | |||
| IMAX (&m, x, &inc_x); | |||
| end(); | |||
| time1 = getsec(); | |||
| timeg += time1; | |||
| } | |||
| timeg /= loops; | |||
| fprintf(stderr, | |||
| " %10.2f MFlops %10.6f sec\n", | |||
| COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg); | |||
| } | |||
| return 0; | |||
| } | |||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| /*************************************************************************** | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "bench.h" | |||
| #undef IMAX | |||
| #ifndef COMPLEX | |||
| #ifdef DOUBLE | |||
| #define IMAX BLASFUNC(idmax) | |||
| #else | |||
| #define IMAX BLASFUNC(ismax) | |||
| #endif | |||
| #endif | |||
| int main(int argc, char *argv[]){ | |||
| FLOAT *x; | |||
| blasint m, i; | |||
| blasint inc_x=1; | |||
| int loops = 1; | |||
| int l; | |||
| char *p; | |||
| int from = 1; | |||
| int to = 200; | |||
| int step = 1; | |||
| double time1,timeg; | |||
| argc--;argv++; | |||
| if (argc > 0) { from = atol(*argv); argc--; argv++;} | |||
| if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} | |||
| if (argc > 0) { step = atol(*argv); argc--; argv++;} | |||
| if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); | |||
| if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); | |||
| fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); | |||
| if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ | |||
| fprintf(stderr,"Out of Memory!!\n");exit(1); | |||
| } | |||
| #ifdef __linux | |||
| srandom(getpid()); | |||
| #endif | |||
| fprintf(stderr, " SIZE Flops\n"); | |||
| for(m = from; m <= to; m += step) | |||
| { | |||
| timeg=0; | |||
| fprintf(stderr, " %6d : ", (int)m); | |||
| for (l=0; l<loops; l++) | |||
| { | |||
| for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){ | |||
| x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| } | |||
| begin(); | |||
| IMAX (&m, x, &inc_x); | |||
| end(); | |||
| time1 = getsec(); | |||
| timeg += time1; | |||
| } | |||
| timeg /= loops; | |||
| fprintf(stderr, | |||
| " %10.2f MFlops %10.6f sec\n", | |||
| COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg); | |||
| } | |||
| return 0; | |||
| } | |||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| @@ -1,114 +1,114 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "bench.h" | |||
| #undef IMIN | |||
| #ifndef COMPLEX | |||
| #ifdef DOUBLE | |||
| #define IMIN BLASFUNC(idmin) | |||
| #else | |||
| #define IMIN BLASFUNC(ismin) | |||
| #endif | |||
| #endif | |||
| int main(int argc, char *argv[]){ | |||
| FLOAT *x; | |||
| blasint m, i; | |||
| blasint inc_x=1; | |||
| int loops = 1; | |||
| int l; | |||
| char *p; | |||
| int from = 1; | |||
| int to = 200; | |||
| int step = 1; | |||
| double time1,timeg; | |||
| argc--;argv++; | |||
| if (argc > 0) { from = atol(*argv); argc--; argv++;} | |||
| if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} | |||
| if (argc > 0) { step = atol(*argv); argc--; argv++;} | |||
| if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); | |||
| if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); | |||
| fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); | |||
| if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ | |||
| fprintf(stderr,"Out of Memory!!\n");exit(1); | |||
| } | |||
| #ifdef __linux | |||
| srandom(getpid()); | |||
| #endif | |||
| fprintf(stderr, " SIZE Flops\n"); | |||
| for(m = from; m <= to; m += step) | |||
| { | |||
| timeg=0; | |||
| fprintf(stderr, " %6d : ", (int)m); | |||
| for (l=0; l<loops; l++) | |||
| { | |||
| for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){ | |||
| x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| } | |||
| begin(); | |||
| IMIN (&m, x, &inc_x); | |||
| end(); | |||
| time1 = getsec(); | |||
| timeg += time1; | |||
| } | |||
| timeg /= loops; | |||
| fprintf(stderr, | |||
| " %10.2f MFlops %10.6f sec\n", | |||
| COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg); | |||
| } | |||
| return 0; | |||
| } | |||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| /*************************************************************************** | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "bench.h" | |||
| #undef IMIN | |||
| #ifndef COMPLEX | |||
| #ifdef DOUBLE | |||
| #define IMIN BLASFUNC(idmin) | |||
| #else | |||
| #define IMIN BLASFUNC(ismin) | |||
| #endif | |||
| #endif | |||
| int main(int argc, char *argv[]){ | |||
| FLOAT *x; | |||
| blasint m, i; | |||
| blasint inc_x=1; | |||
| int loops = 1; | |||
| int l; | |||
| char *p; | |||
| int from = 1; | |||
| int to = 200; | |||
| int step = 1; | |||
| double time1,timeg; | |||
| argc--;argv++; | |||
| if (argc > 0) { from = atol(*argv); argc--; argv++;} | |||
| if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} | |||
| if (argc > 0) { step = atol(*argv); argc--; argv++;} | |||
| if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); | |||
| if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); | |||
| fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); | |||
| if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ | |||
| fprintf(stderr,"Out of Memory!!\n");exit(1); | |||
| } | |||
| #ifdef __linux | |||
| srandom(getpid()); | |||
| #endif | |||
| fprintf(stderr, " SIZE Flops\n"); | |||
| for(m = from; m <= to; m += step) | |||
| { | |||
| timeg=0; | |||
| fprintf(stderr, " %6d : ", (int)m); | |||
| for (l=0; l<loops; l++) | |||
| { | |||
| for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){ | |||
| x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| } | |||
| begin(); | |||
| IMIN (&m, x, &inc_x); | |||
| end(); | |||
| time1 = getsec(); | |||
| timeg += time1; | |||
| } | |||
| timeg /= loops; | |||
| fprintf(stderr, | |||
| " %10.2f MFlops %10.6f sec\n", | |||
| COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg); | |||
| } | |||
| return 0; | |||
| } | |||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| @@ -1,113 +1,113 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "bench.h" | |||
| #undef NAMAX | |||
| #ifndef COMPLEX | |||
| #ifdef DOUBLE | |||
| #define NAMAX BLASFUNC(dmax) | |||
| #else | |||
| #define NAMAX BLASFUNC(smax) | |||
| #endif | |||
| #endif | |||
| int main(int argc, char *argv[]){ | |||
| FLOAT *x; | |||
| blasint m, i; | |||
| blasint inc_x=1; | |||
| int loops = 1; | |||
| int l; | |||
| char *p; | |||
| int from = 1; | |||
| int to = 200; | |||
| int step = 1; | |||
| double time1,timeg; | |||
| argc--;argv++; | |||
| if (argc > 0) { from = atol(*argv); argc--; argv++;} | |||
| if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} | |||
| if (argc > 0) { step = atol(*argv); argc--; argv++;} | |||
| if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); | |||
| if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); | |||
| fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); | |||
| if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ | |||
| fprintf(stderr,"Out of Memory!!\n");exit(1); | |||
| } | |||
| #ifdef __linux | |||
| srandom(getpid()); | |||
| #endif | |||
| fprintf(stderr, " SIZE Flops\n"); | |||
| for(m = from; m <= to; m += step) | |||
| { | |||
| timeg=0; | |||
| fprintf(stderr, " %6d : ", (int)m); | |||
| for (l=0; l<loops; l++) | |||
| { | |||
| for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){ | |||
| x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| } | |||
| begin(); | |||
| NAMAX (&m, x, &inc_x); | |||
| end(); | |||
| time1 = getsec(); | |||
| timeg += time1; | |||
| } | |||
| timeg /= loops; | |||
| fprintf(stderr, | |||
| " %10.2f MFlops %10.6f sec\n", | |||
| COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg); | |||
| } | |||
| return 0; | |||
| } | |||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| /*************************************************************************** | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "bench.h" | |||
| #undef NAMAX | |||
| #ifndef COMPLEX | |||
| #ifdef DOUBLE | |||
| #define NAMAX BLASFUNC(dmax) | |||
| #else | |||
| #define NAMAX BLASFUNC(smax) | |||
| #endif | |||
| #endif | |||
| int main(int argc, char *argv[]){ | |||
| FLOAT *x; | |||
| blasint m, i; | |||
| blasint inc_x=1; | |||
| int loops = 1; | |||
| int l; | |||
| char *p; | |||
| int from = 1; | |||
| int to = 200; | |||
| int step = 1; | |||
| double time1,timeg; | |||
| argc--;argv++; | |||
| if (argc > 0) { from = atol(*argv); argc--; argv++;} | |||
| if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} | |||
| if (argc > 0) { step = atol(*argv); argc--; argv++;} | |||
| if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); | |||
| if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); | |||
| fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); | |||
| if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ | |||
| fprintf(stderr,"Out of Memory!!\n");exit(1); | |||
| } | |||
| #ifdef __linux | |||
| srandom(getpid()); | |||
| #endif | |||
| fprintf(stderr, " SIZE Flops\n"); | |||
| for(m = from; m <= to; m += step) | |||
| { | |||
| timeg=0; | |||
| fprintf(stderr, " %6d : ", (int)m); | |||
| for (l=0; l<loops; l++) | |||
| { | |||
| for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){ | |||
| x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| } | |||
| begin(); | |||
| NAMAX (&m, x, &inc_x); | |||
| end(); | |||
| time1 = getsec(); | |||
| timeg += time1; | |||
| } | |||
| timeg /= loops; | |||
| fprintf(stderr, | |||
| " %10.2f MFlops %10.6f sec\n", | |||
| COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg); | |||
| } | |||
| return 0; | |||
| } | |||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| @@ -1,113 +1,113 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "bench.h" | |||
| #undef NAMIN | |||
| #ifndef COMPLEX | |||
| #ifdef DOUBLE | |||
| #define NAMIN BLASFUNC(dmin) | |||
| #else | |||
| #define NAMIN BLASFUNC(smin) | |||
| #endif | |||
| #endif | |||
| int main(int argc, char *argv[]){ | |||
| FLOAT *x; | |||
| blasint m, i; | |||
| blasint inc_x=1; | |||
| int loops = 1; | |||
| int l; | |||
| char *p; | |||
| int from = 1; | |||
| int to = 200; | |||
| int step = 1; | |||
| double time1,timeg; | |||
| argc--;argv++; | |||
| if (argc > 0) { from = atol(*argv); argc--; argv++;} | |||
| if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} | |||
| if (argc > 0) { step = atol(*argv); argc--; argv++;} | |||
| if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); | |||
| if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); | |||
| fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); | |||
| if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ | |||
| fprintf(stderr,"Out of Memory!!\n");exit(1); | |||
| } | |||
| #ifdef __linux | |||
| srandom(getpid()); | |||
| #endif | |||
| fprintf(stderr, " SIZE Flops\n"); | |||
| for(m = from; m <= to; m += step) | |||
| { | |||
| timeg=0; | |||
| fprintf(stderr, " %6d : ", (int)m); | |||
| for (l=0; l<loops; l++) | |||
| { | |||
| for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){ | |||
| x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| } | |||
| begin(); | |||
| NAMIN (&m, x, &inc_x); | |||
| end(); | |||
| time1 = getsec(); | |||
| timeg += time1; | |||
| } | |||
| timeg /= loops; | |||
| fprintf(stderr, | |||
| " %10.2f MFlops %10.6f sec\n", | |||
| COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg); | |||
| } | |||
| return 0; | |||
| } | |||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| /*************************************************************************** | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "bench.h" | |||
| #undef NAMIN | |||
| #ifndef COMPLEX | |||
| #ifdef DOUBLE | |||
| #define NAMIN BLASFUNC(dmin) | |||
| #else | |||
| #define NAMIN BLASFUNC(smin) | |||
| #endif | |||
| #endif | |||
| int main(int argc, char *argv[]){ | |||
| FLOAT *x; | |||
| blasint m, i; | |||
| blasint inc_x=1; | |||
| int loops = 1; | |||
| int l; | |||
| char *p; | |||
| int from = 1; | |||
| int to = 200; | |||
| int step = 1; | |||
| double time1,timeg; | |||
| argc--;argv++; | |||
| if (argc > 0) { from = atol(*argv); argc--; argv++;} | |||
| if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} | |||
| if (argc > 0) { step = atol(*argv); argc--; argv++;} | |||
| if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); | |||
| if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); | |||
| fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); | |||
| if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ | |||
| fprintf(stderr,"Out of Memory!!\n");exit(1); | |||
| } | |||
| #ifdef __linux | |||
| srandom(getpid()); | |||
| #endif | |||
| fprintf(stderr, " SIZE Flops\n"); | |||
| for(m = from; m <= to; m += step) | |||
| { | |||
| timeg=0; | |||
| fprintf(stderr, " %6d : ", (int)m); | |||
| for (l=0; l<loops; l++) | |||
| { | |||
| for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){ | |||
| x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| } | |||
| begin(); | |||
| NAMIN (&m, x, &inc_x); | |||
| end(); | |||
| time1 = getsec(); | |||
| timeg += time1; | |||
| } | |||
| timeg /= loops; | |||
| fprintf(stderr, | |||
| " %10.2f MFlops %10.6f sec\n", | |||
| COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg); | |||
| } | |||
| return 0; | |||
| } | |||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| @@ -1,138 +1,138 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |||
| CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE | |||
| GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | |||
| HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | |||
| LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF | |||
| THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "bench.h" | |||
| #undef ROTM | |||
| #ifdef DOUBLE | |||
| #define ROTM BLASFUNC(drotm) | |||
| #else | |||
| #define ROTM BLASFUNC(srotm) | |||
| #endif | |||
| /* | |||
|  * Benchmark driver for ROTM (srotm, or drotm when DOUBLE is defined). | |||
|  * | |||
|  * Command line: [from [to [step]]] - vector lengths to sweep | |||
|  * (defaults 1, 200, 1; `to` is raised to `from` if smaller). | |||
|  * Environment:  OPENBLAS_LOOPS - timed calls per length (default 1) | |||
|  *               OPENBLAS_INCX / OPENBLAS_INCY - vector strides (default 1) | |||
|  * | |||
|  * For each length m the vectors are filled with random values in | |||
|  * [-0.5, 0.5], ROTM is called `loops` times, and the mean of the values | |||
|  * returned by getsec() plus the derived MFlops figure go to stderr. | |||
|  * Returns 0; exits with status 1 if a buffer cannot be allocated. | |||
|  */ | |||
| int main(int argc, char *argv[]) | |||
| { | |||
|   FLOAT *x, *y; | |||
|   blasint m, i; | |||
|   blasint inc_x = 1, inc_y = 1; | |||
|   FLOAT param[5] = {1, 2.0, 3.0, 4.0, 5.0}; | |||
|   int loops = 1; | |||
|   int l; | |||
|   char *p; | |||
|   int from = 1; | |||
|   int to = 200; | |||
|   int step = 1; | |||
|   double time1, timeg; | |||
|   /* Skip the program name, then consume up to three positional arguments. */ | |||
|   argc--; | |||
|   argv++; | |||
|   if (argc > 0) { | |||
|     from = atol(*argv); | |||
|     argc--; | |||
|     argv++; | |||
|   } | |||
|   if (argc > 0) { | |||
|     to = MAX(atol(*argv), from); | |||
|     argc--; | |||
|     argv++; | |||
|   } | |||
|   if (argc > 0) { | |||
|     step = atol(*argv); | |||
|     argc--; | |||
|     argv++; | |||
|   } | |||
|   if ((p = getenv("OPENBLAS_LOOPS"))) | |||
|     loops = atoi(p); | |||
|   if ((p = getenv("OPENBLAS_INCX"))) | |||
|     inc_x = atoi(p); | |||
|   if ((p = getenv("OPENBLAS_INCY"))) | |||
|     inc_y = atoi(p); | |||
|   /* A non-positive step would keep the size sweep below from terminating. */ | |||
|   if (step <= 0) | |||
|     step = 1; | |||
|   /* At least one timed call is required; the average divides by loops. */ | |||
|   if (loops <= 0) | |||
|     loops = 1; | |||
|   fprintf( | |||
|       stderr, | |||
|       "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", | |||
|       from, to, step, inc_x, inc_y, loops); | |||
|   /* Buffers are sized once for the largest length in the sweep. */ | |||
|   if ((x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == | |||
|       NULL) { | |||
|     fprintf(stderr, "Out of Memory!!\n"); | |||
|     exit(1); | |||
|   } | |||
|   if ((y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == | |||
|       NULL) { | |||
|     fprintf(stderr, "Out of Memory!!\n"); | |||
|     exit(1); | |||
|   } | |||
| #ifdef __linux | |||
|   srandom(getpid()); | |||
| #endif | |||
|   fprintf(stderr, " SIZE Flops\n"); | |||
|   for (m = from; m <= to; m += step) { | |||
|     timeg = 0; | |||
|     fprintf(stderr, " %6d : ", (int)m); | |||
|     /* Inputs are generated once per length, outside the timed loop. */ | |||
|     for (i = 0; i < m * COMPSIZE * abs(inc_x); i++) { | |||
|       x[i] = ((FLOAT)rand() / (FLOAT)RAND_MAX) - 0.5; | |||
|     } | |||
|     for (i = 0; i < m * COMPSIZE * abs(inc_y); i++) { | |||
|       y[i] = ((FLOAT)rand() / (FLOAT)RAND_MAX) - 0.5; | |||
|     } | |||
|     for (l = 0; l < loops; l++) { | |||
|       /* begin()/end() come from bench.h; presumably they bracket the timed region. */ | |||
|       begin(); | |||
|       ROTM(&m, x, &inc_x, y, &inc_y, param); | |||
|       end(); | |||
|       time1 = getsec(); | |||
|       timeg += time1; | |||
|     } | |||
|     timeg /= loops; | |||
|     fprintf(stderr, " %10.2f MFlops %10.6f sec\n", | |||
|             COMPSIZE * COMPSIZE * 6. * (double)m / timeg * 1.e-6, timeg); | |||
|   } | |||
|   free(x); | |||
|   free(y); | |||
|   return 0; | |||
| } | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |||
| CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE | |||
| GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | |||
| HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | |||
| LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF | |||
| THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "bench.h" | |||
| #undef ROTM | |||
| #ifdef DOUBLE | |||
| #define ROTM BLASFUNC(drotm) | |||
| #else | |||
| #define ROTM BLASFUNC(srotm) | |||
| #endif | |||
| /* | |||
|  * Benchmark driver for ROTM (srotm, or drotm when DOUBLE is defined). | |||
|  * | |||
|  * Command line: [from [to [step]]] - vector lengths to sweep | |||
|  * (defaults 1, 200, 1; `to` is raised to `from` if smaller). | |||
|  * Environment:  OPENBLAS_LOOPS - timed calls per length (default 1) | |||
|  *               OPENBLAS_INCX / OPENBLAS_INCY - vector strides (default 1) | |||
|  * | |||
|  * For each length m the vectors are filled with random values in | |||
|  * [-0.5, 0.5], ROTM is called `loops` times, and the mean of the values | |||
|  * returned by getsec() plus the derived MFlops figure go to stderr. | |||
|  * Returns 0; exits with status 1 if a buffer cannot be allocated. | |||
|  */ | |||
| int main(int argc, char *argv[]) | |||
| { | |||
|   FLOAT *x, *y; | |||
|   blasint m, i; | |||
|   blasint inc_x = 1, inc_y = 1; | |||
|   FLOAT param[5] = {1, 2.0, 3.0, 4.0, 5.0}; | |||
|   int loops = 1; | |||
|   int l; | |||
|   char *p; | |||
|   int from = 1; | |||
|   int to = 200; | |||
|   int step = 1; | |||
|   double time1, timeg; | |||
|   /* Skip the program name, then consume up to three positional arguments. */ | |||
|   argc--; | |||
|   argv++; | |||
|   if (argc > 0) { | |||
|     from = atol(*argv); | |||
|     argc--; | |||
|     argv++; | |||
|   } | |||
|   if (argc > 0) { | |||
|     to = MAX(atol(*argv), from); | |||
|     argc--; | |||
|     argv++; | |||
|   } | |||
|   if (argc > 0) { | |||
|     step = atol(*argv); | |||
|     argc--; | |||
|     argv++; | |||
|   } | |||
|   if ((p = getenv("OPENBLAS_LOOPS"))) | |||
|     loops = atoi(p); | |||
|   if ((p = getenv("OPENBLAS_INCX"))) | |||
|     inc_x = atoi(p); | |||
|   if ((p = getenv("OPENBLAS_INCY"))) | |||
|     inc_y = atoi(p); | |||
|   /* A non-positive step would keep the size sweep below from terminating. */ | |||
|   if (step <= 0) | |||
|     step = 1; | |||
|   /* At least one timed call is required; the average divides by loops. */ | |||
|   if (loops <= 0) | |||
|     loops = 1; | |||
|   fprintf( | |||
|       stderr, | |||
|       "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", | |||
|       from, to, step, inc_x, inc_y, loops); | |||
|   /* Buffers are sized once for the largest length in the sweep. */ | |||
|   if ((x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == | |||
|       NULL) { | |||
|     fprintf(stderr, "Out of Memory!!\n"); | |||
|     exit(1); | |||
|   } | |||
|   if ((y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == | |||
|       NULL) { | |||
|     fprintf(stderr, "Out of Memory!!\n"); | |||
|     exit(1); | |||
|   } | |||
| #ifdef __linux | |||
|   srandom(getpid()); | |||
| #endif | |||
|   fprintf(stderr, " SIZE Flops\n"); | |||
|   for (m = from; m <= to; m += step) { | |||
|     timeg = 0; | |||
|     fprintf(stderr, " %6d : ", (int)m); | |||
|     /* Inputs are generated once per length, outside the timed loop. */ | |||
|     for (i = 0; i < m * COMPSIZE * abs(inc_x); i++) { | |||
|       x[i] = ((FLOAT)rand() / (FLOAT)RAND_MAX) - 0.5; | |||
|     } | |||
|     for (i = 0; i < m * COMPSIZE * abs(inc_y); i++) { | |||
|       y[i] = ((FLOAT)rand() / (FLOAT)RAND_MAX) - 0.5; | |||
|     } | |||
|     for (l = 0; l < loops; l++) { | |||
|       /* begin()/end() come from bench.h; presumably they bracket the timed region. */ | |||
|       begin(); | |||
|       ROTM(&m, x, &inc_x, y, &inc_y, param); | |||
|       end(); | |||
|       time1 = getsec(); | |||
|       timeg += time1; | |||
|     } | |||
|     timeg /= loops; | |||
|     fprintf(stderr, " %10.2f MFlops %10.6f sec\n", | |||
|             COMPSIZE * COMPSIZE * 6. * (double)m / timeg * 1.e-6, timeg); | |||
|   } | |||
|   free(x); | |||
|   free(y); | |||
|   return 0; | |||
| } | |||
| @@ -45,7 +45,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| int main(int argc, char *argv[]){ | |||
| FLOAT *x, *y; | |||
| FLOAT *x; | |||
| FLOAT alpha[2] = { 2.0, 2.0 }; | |||
| blasint m, i; | |||
| blasint inc_x=1,inc_y=1; | |||
| @@ -74,10 +74,6 @@ int main(int argc, char *argv[]){ | |||
| fprintf(stderr,"Out of Memory!!\n");exit(1); | |||
| } | |||
| if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ | |||
| fprintf(stderr,"Out of Memory!!\n");exit(1); | |||
| } | |||
| #ifdef __linux | |||
| srandom(getpid()); | |||
| #endif | |||
| @@ -91,30 +87,20 @@ int main(int argc, char *argv[]){ | |||
| fprintf(stderr, " %6d : ", (int)m); | |||
| for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){ | |||
| x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| } | |||
| begin(); | |||
| for (l=0; l<loops; l++) | |||
| { | |||
| for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){ | |||
| x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| } | |||
| for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){ | |||
| y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| } | |||
| begin(); | |||
| SCAL (&m, alpha, x, &inc_x); | |||
| } | |||
| end(); | |||
| end(); | |||
| time1 = getsec(); | |||
| timeg += time1; | |||
| } | |||
| time1 = getsec(); | |||
| timeg /= loops; | |||
| timeg = time1 / loops; | |||
| #ifdef COMPLEX | |||
| fprintf(stderr, " %10.2f MFlops %10.6f sec\n", 6. * (double)m / timeg * 1.e-6, timeg); | |||
| @@ -1,146 +1,146 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "bench.h" | |||
| #undef SPMV | |||
| #ifndef COMPLEX | |||
| #ifdef DOUBLE | |||
| #define SPMV BLASFUNC(dspmv) | |||
| #else | |||
| #define SPMV BLASFUNC(sspmv) | |||
| #endif | |||
| #else | |||
| #ifdef DOUBLE | |||
| #define SPMV BLASFUNC(zspmv) | |||
| #else | |||
| #define SPMV BLASFUNC(cspmv) | |||
| #endif | |||
| #endif | |||
| /* | |||
|  * Benchmark driver for SPMV (sspmv/dspmv/cspmv/zspmv, selected by the | |||
|  * COMPLEX and DOUBLE macros). | |||
|  * | |||
|  * Command line: [from [to [step]]] - problem sizes to sweep | |||
|  * (defaults 1, 200, 1; `to` is raised to `from` if smaller). | |||
|  * Environment:  OPENBLAS_LOOPS - timed calls per size (default 1) | |||
|  *               OPENBLAS_INCX / OPENBLAS_INCY - vector strides (default 1) | |||
|  *               OPENBLAS_UPLO - first character is passed as uplo (default 'L') | |||
|  * | |||
|  * For each size m the matrix buffer is filled once, the vectors are | |||
|  * refilled before every call, and the MFlops figure derived from the mean | |||
|  * of the values returned by getsec() is written to stderr. | |||
|  * Returns 0; exits with status 1 if a buffer cannot be allocated. | |||
|  */ | |||
| int main(int argc, char *argv[]){ | |||
|   FLOAT *a, *x, *y; | |||
|   FLOAT alpha[] = {1.0, 1.0}; | |||
|   FLOAT beta [] = {1.0, 1.0}; | |||
|   char uplo='L'; | |||
|   blasint m, i, j; | |||
|   blasint inc_x=1,inc_y=1; | |||
|   int loops = 1; | |||
|   int l; | |||
|   char *p; | |||
|   int from = 1; | |||
|   int to = 200; | |||
|   int step = 1; | |||
|   double time1,timeg; | |||
|   /* Skip the program name, then consume up to three positional arguments. */ | |||
|   argc--;argv++; | |||
|   if (argc > 0) { from = atol(*argv); argc--; argv++;} | |||
|   if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} | |||
|   if (argc > 0) { step = atol(*argv); argc--; argv++;} | |||
|   if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); | |||
|   if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); | |||
|   if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); | |||
|   if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p; | |||
|   /* A non-positive step would keep the size sweep below from terminating. */ | |||
|   if (step <= 0) step = 1; | |||
|   /* At least one timed call is required; the average divides by loops. */ | |||
|   if (loops <= 0) loops = 1; | |||
|   fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = '%c' Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,uplo,inc_x,inc_y,loops); | |||
|   /* All buffers are sized once for the largest size in the sweep. */ | |||
|   if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ | |||
|     fprintf(stderr,"Out of Memory!!\n");exit(1); | |||
|   } | |||
|   if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ | |||
|     fprintf(stderr,"Out of Memory!!\n");exit(1); | |||
|   } | |||
|   if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ | |||
|     fprintf(stderr,"Out of Memory!!\n");exit(1); | |||
|   } | |||
| #ifdef __linux | |||
|   srandom(getpid()); | |||
| #endif | |||
|   fprintf(stderr, " SIZE Flops\n"); | |||
|   for(m = from; m <= to; m += step) | |||
|   { | |||
|     timeg=0; | |||
|     fprintf(stderr, " %6dx%d : ", (int)m,(int)m); | |||
|     /* Fill the leading m*m*COMPSIZE entries of a; indices are widened to long. */ | |||
|     for(j = 0; j < m; j++){ | |||
|       for(i = 0; i < m * COMPSIZE; i++){ | |||
|         a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
|       } | |||
|     } | |||
|     for (l=0; l<loops; l++) | |||
|     { | |||
|       /* Vectors are regenerated before each call, outside the begin()/end() pair. */ | |||
|       for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){ | |||
|         x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
|       } | |||
|       for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){ | |||
|         y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
|       } | |||
|       begin(); | |||
|       SPMV (&uplo, &m, alpha, a, x, &inc_x, beta, y, &inc_y ); | |||
|       end(); | |||
|       time1 = getsec(); | |||
|       timeg += time1; | |||
|     } | |||
|     timeg /= loops; | |||
|     fprintf(stderr, | |||
|       " %10.2f MFlops\n", | |||
|       COMPSIZE * COMPSIZE * 2. * (double)m * (double)m / timeg * 1.e-6); | |||
|   } | |||
|   free(a); | |||
|   free(x); | |||
|   free(y); | |||
|   return 0; | |||
| } | |||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "bench.h" | |||
| #undef SPMV | |||
| #ifndef COMPLEX | |||
| #ifdef DOUBLE | |||
| #define SPMV BLASFUNC(dspmv) | |||
| #else | |||
| #define SPMV BLASFUNC(sspmv) | |||
| #endif | |||
| #else | |||
| #ifdef DOUBLE | |||
| #define SPMV BLASFUNC(zspmv) | |||
| #else | |||
| #define SPMV BLASFUNC(cspmv) | |||
| #endif | |||
| #endif | |||
| /* | |||
|  * Benchmark driver for SPMV (sspmv/dspmv/cspmv/zspmv, selected by the | |||
|  * COMPLEX and DOUBLE macros). | |||
|  * | |||
|  * Command line: [from [to [step]]] - problem sizes to sweep | |||
|  * (defaults 1, 200, 1; `to` is raised to `from` if smaller). | |||
|  * Environment:  OPENBLAS_LOOPS - timed calls per size (default 1) | |||
|  *               OPENBLAS_INCX / OPENBLAS_INCY - vector strides (default 1) | |||
|  *               OPENBLAS_UPLO - first character is passed as uplo (default 'L') | |||
|  * | |||
|  * For each size m the matrix buffer is filled once, the vectors are | |||
|  * refilled before every call, and the MFlops figure derived from the mean | |||
|  * of the values returned by getsec() is written to stderr. | |||
|  * Returns 0; exits with status 1 if a buffer cannot be allocated. | |||
|  */ | |||
| int main(int argc, char *argv[]){ | |||
|   FLOAT *a, *x, *y; | |||
|   FLOAT alpha[] = {1.0, 1.0}; | |||
|   FLOAT beta [] = {1.0, 1.0}; | |||
|   char uplo='L'; | |||
|   blasint m, i, j; | |||
|   blasint inc_x=1,inc_y=1; | |||
|   int loops = 1; | |||
|   int l; | |||
|   char *p; | |||
|   int from = 1; | |||
|   int to = 200; | |||
|   int step = 1; | |||
|   double time1,timeg; | |||
|   /* Skip the program name, then consume up to three positional arguments. */ | |||
|   argc--;argv++; | |||
|   if (argc > 0) { from = atol(*argv); argc--; argv++;} | |||
|   if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} | |||
|   if (argc > 0) { step = atol(*argv); argc--; argv++;} | |||
|   if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); | |||
|   if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); | |||
|   if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); | |||
|   if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p; | |||
|   /* A non-positive step would keep the size sweep below from terminating. */ | |||
|   if (step <= 0) step = 1; | |||
|   /* At least one timed call is required; the average divides by loops. */ | |||
|   if (loops <= 0) loops = 1; | |||
|   fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = '%c' Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,uplo,inc_x,inc_y,loops); | |||
|   /* All buffers are sized once for the largest size in the sweep. */ | |||
|   if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ | |||
|     fprintf(stderr,"Out of Memory!!\n");exit(1); | |||
|   } | |||
|   if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ | |||
|     fprintf(stderr,"Out of Memory!!\n");exit(1); | |||
|   } | |||
|   if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ | |||
|     fprintf(stderr,"Out of Memory!!\n");exit(1); | |||
|   } | |||
| #ifdef __linux | |||
|   srandom(getpid()); | |||
| #endif | |||
|   fprintf(stderr, " SIZE Flops\n"); | |||
|   for(m = from; m <= to; m += step) | |||
|   { | |||
|     timeg=0; | |||
|     fprintf(stderr, " %6dx%d : ", (int)m,(int)m); | |||
|     /* Fill the leading m*m*COMPSIZE entries of a; indices are widened to long. */ | |||
|     for(j = 0; j < m; j++){ | |||
|       for(i = 0; i < m * COMPSIZE; i++){ | |||
|         a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
|       } | |||
|     } | |||
|     for (l=0; l<loops; l++) | |||
|     { | |||
|       /* Vectors are regenerated before each call, outside the begin()/end() pair. */ | |||
|       for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){ | |||
|         x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
|       } | |||
|       for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){ | |||
|         y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
|       } | |||
|       begin(); | |||
|       SPMV (&uplo, &m, alpha, a, x, &inc_x, beta, y, &inc_y ); | |||
|       end(); | |||
|       time1 = getsec(); | |||
|       timeg += time1; | |||
|     } | |||
|     timeg /= loops; | |||
|     fprintf(stderr, | |||
|       " %10.2f MFlops\n", | |||
|       COMPSIZE * COMPSIZE * 2. * (double)m * (double)m / timeg * 1.e-6); | |||
|   } | |||
|   free(a); | |||
|   free(x); | |||
|   free(y); | |||
|   return 0; | |||
| } | |||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| @@ -155,6 +155,39 @@ if (${CORE} STREQUAL A64FX) | |||
| endif () | |||
| endif () | |||
| # NEOVERSEN2, non-DYNAMIC_ARCH builds only: pick the -march/-mtune flags | |||
| # from the version string the C compiler reports via -dumpversion. | |||
| if (${CORE} STREQUAL NEOVERSEN2 AND NOT DYNAMIC_ARCH) | |||
|   execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) | |||
|   if (NOT ${GCC_VERSION} VERSION_LESS 10.4) | |||
|     # Reported version is 10.4 or newer: request the N2-specific tuning. | |||
|     set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2") | |||
|   else () | |||
|     # Older versions get the generic armv8.2-a+sve fallback. | |||
|     set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve") | |||
|   endif () | |||
| endif () | |||
| # NEOVERSEV1, non-DYNAMIC_ARCH builds only: pick the -march/-mtune flags | |||
| # from the version string the C compiler reports via -dumpversion. | |||
| if (${CORE} STREQUAL NEOVERSEV1 AND NOT DYNAMIC_ARCH) | |||
|   execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) | |||
|   if (NOT ${GCC_VERSION} VERSION_LESS 10.4) | |||
|     # Reported version is 10.4 or newer: request the V1-specific tuning. | |||
|     set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.4-a+sve -mtune=neoverse-v1") | |||
|   else () | |||
|     # Older versions get the generic armv8.2-a+sve fallback. | |||
|     set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve") | |||
|   endif () | |||
| endif () | |||
| # NEOVERSEN1, non-DYNAMIC_ARCH builds only: pick the -march/-mtune flags | |||
| # from the version string the C compiler reports via -dumpversion. | |||
| # Neoverse N1 does not implement SVE, so "+sve" must not be requested here; | |||
| # with it the compiler may emit instructions the core cannot execute. | |||
| if (${CORE} STREQUAL NEOVERSEN1) | |||
|   if (NOT DYNAMIC_ARCH) | |||
|     execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) | |||
|     if (${GCC_VERSION} VERSION_GREATER 9.4 OR ${GCC_VERSION} VERSION_EQUAL 9.4) | |||
|       # Reported version is 9.4 or newer: add the N1-specific tuning. | |||
|       set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a -mtune=neoverse-n1") | |||
|     else () | |||
|       set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a") | |||
|     endif() | |||
|   endif () | |||
| endif () | |||
| if (${CORE} STREQUAL ARMV8SVE) | |||
| if (NOT DYNAMIC_ARCH) | |||
| set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve") | |||
| @@ -318,6 +318,8 @@ set(CSRC | |||
| lapacke_clacn2.c | |||
| lapacke_clag2z.c | |||
| lapacke_clag2z_work.c | |||
| lapacke_clangb.c | |||
| lapacke_clangb_work.c | |||
| lapacke_clange.c | |||
| lapacke_clange_work.c | |||
| lapacke_clanhe.c | |||
| @@ -803,6 +805,8 @@ set(DSRC | |||
| lapacke_dlag2s_work.c | |||
| lapacke_dlamch.c | |||
| lapacke_dlamch_work.c | |||
| lapacke_dlangb.c | |||
| lapacke_dlangb_work.c | |||
| lapacke_dlange.c | |||
| lapacke_dlange_work.c | |||
| lapacke_dlansy.c | |||
| @@ -1381,6 +1385,8 @@ set(SSRC | |||
| lapacke_slag2d_work.c | |||
| lapacke_slamch.c | |||
| lapacke_slamch_work.c | |||
| lapacke_slangb.c | |||
| lapacke_slangb_work.c | |||
| lapacke_slange.c | |||
| lapacke_slange_work.c | |||
| lapacke_slansy.c | |||
| @@ -2089,6 +2095,8 @@ set(ZSRC | |||
| lapacke_zlacrm_work.c | |||
| lapacke_zlag2c.c | |||
| lapacke_zlag2c_work.c | |||
| lapacke_zlangb.c | |||
| lapacke_zlangb_work.c | |||
| lapacke_zlange.c | |||
| lapacke_zlange_work.c | |||
| lapacke_zlanhe.c | |||
| @@ -2481,6 +2489,8 @@ set(Utils_SRC | |||
| lapacke_ctp_nancheck.c lapacke_dtr_trans.c lapacke_str_trans.c lapacke_ztp_trans.c | |||
| lapacke_ctp_trans.c lapacke_lsame.c lapacke_xerbla.c lapacke_ztr_nancheck.c | |||
| lapacke_ctr_nancheck.c lapacke_make_complex_double.c lapacke_z_nancheck.c lapacke_ztr_trans.c | |||
| lapacke_ctz_nancheck.c lapacke_ctz_trans.c lapacke_dtz_nancheck.c lapacke_dtz_trans.c | |||
| lapacke_stz_nancheck.c lapacke_stz_trans.c lapacke_ztz_nancheck.c lapacke_ztz_trans.c | |||
| ) | |||
| set(LAPACKE_REL_SRC "") | |||
| @@ -2,7 +2,7 @@ libdir=@CMAKE_INSTALL_FULL_LIBDIR@ | |||
| libsuffix=@SUFFIX64_UNDERSCORE@ | |||
| includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@ | |||
| openblas_config=USE_64BITINT=@USE_64BITINT@ NO_CBLAS=@NO_CBLAS@ NO_LAPACK=@NO_LAPACK@ NO_LAPACKE=@NO_LAPACKE@ DYNAMIC_ARCH=@DYNAMIC_ARCH@ DYNAMIC_OLDER=@DYNAMIC_OLDER@ NO_AFFINITY=@NO_AFFINITY@ USE_OPENMP=@USE_OPENMP@ @CORE@ MAX_THREADS=@NUM_THREADS@ | |||
| openblas_config=USE_64BITINT=@INTERFACE64@ NO_CBLAS=@NO_CBLAS@ NO_LAPACK=@NO_LAPACK@ NO_LAPACKE=@NO_LAPACKE@ DYNAMIC_ARCH=@DYNAMIC_ARCH@ DYNAMIC_OLDER=@DYNAMIC_OLDER@ NO_AFFINITY=@NO_AFFINITY@ USE_OPENMP=@USE_OPENMP@ @CORE@ MAX_THREADS=@NUM_THREADS@ | |||
| Name: OpenBLAS | |||
| Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version | |||
| Version: @OPENBLAS_VERSION@ | |||
| @@ -197,14 +197,14 @@ if (DEFINED TARGET) | |||
| if (${TARGET} STREQUAL SKYLAKEX AND NOT NO_AVX512) | |||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512") | |||
| endif() | |||
| if (${TARGET} STREQUAL HASWELL AND NOT NO_AVX2) | |||
| if ((${TARGET} STREQUAL HASWELL OR ${TARGET} STREQUAL ZEN) AND NOT NO_AVX2) | |||
| if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU") | |||
| execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) | |||
| if (${GCC_VERSION} VERSION_GREATER 4.7 OR ${GCC_VERSION} VERSION_EQUAL 4.7) | |||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2") | |||
| endif() | |||
| elseif (${CMAKE_C_COMPILER_ID} STREQUAL "CLANG") | |||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2") | |||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2 -mfma") | |||
| endif() | |||
| endif() | |||
| if (DEFINED HAVE_AVX) | |||
| @@ -90,7 +90,7 @@ extern "C" { | |||
| #endif | |||
| #include <time.h> | |||
| #ifdef OS_LINUX | |||
| #if defined(OS_LINUX) || defined(OS_QNX) | |||
| #include <malloc.h> | |||
| #include <sched.h> | |||
| #endif | |||
| @@ -107,7 +107,7 @@ extern "C" { | |||
| #endif | |||
| #endif | |||
| #ifdef OS_HAIKU | |||
| #if defined(OS_HAIKU) || defined(OS_QNX) | |||
| #define NO_SYSV_IPC | |||
| #endif | |||
| @@ -387,6 +387,10 @@ typedef int blasint; | |||
| #endif | |||
| */ | |||
| #ifdef __EMSCRIPTEN__ | |||
| #define YIELDING | |||
| #endif | |||
| #ifndef YIELDING | |||
| #define YIELDING sched_yield() | |||
| #endif | |||
| @@ -50,6 +50,7 @@ typedef struct { | |||
| #ifdef BUILD_BFLOAT16 | |||
| int sbgemm_p, sbgemm_q, sbgemm_r; | |||
| int sbgemm_unroll_m, sbgemm_unroll_n, sbgemm_unroll_mn; | |||
| int sbgemm_align_k; | |||
| void (*sbstobf16_k) (BLASLONG, float *, BLASLONG, bfloat16 *, BLASLONG); | |||
| void (*sbdtobf16_k) (BLASLONG, double *, BLASLONG, bfloat16 *, BLASLONG); | |||
| @@ -1544,6 +1544,17 @@ int get_cpuname(void){ | |||
| return CPUTYPE_NEHALEM; | |||
| } | |||
| break; | |||
| case 11: //family 6 exmodel 11 | |||
| switch (model) { | |||
| case 7: // Raptor Lake | |||
| if(support_avx2()) | |||
| return CPUTYPE_HASWELL; | |||
| if(support_avx()) | |||
| return CPUTYPE_SANDYBRIDGE; | |||
| else | |||
| return CPUTYPE_NEHALEM; | |||
| } | |||
| break; | |||
| } | |||
| break; | |||
| case 0x7: | |||
| @@ -2334,6 +2345,18 @@ int get_coretype(void){ | |||
| return CORE_NEHALEM; | |||
| } | |||
| case 11: | |||
| switch (model) { | |||
| case 7: // Raptor Lake | |||
| #ifndef NO_AVX2 | |||
| if(support_avx2()) | |||
| return CORE_HASWELL; | |||
| #endif | |||
| if(support_avx()) | |||
| return CORE_SANDYBRIDGE; | |||
| else | |||
| return CORE_NEHALEM; | |||
| } | |||
| case 15: | |||
| if (model <= 0x2) return CORE_NORTHWOOD; | |||
| else return CORE_PRESCOTT; | |||
| @@ -173,3 +173,8 @@ HAVE_C11 | |||
| ARCH_E2K | |||
| #endif | |||
| #if defined(__EMSCRIPTEN__) | |||
| ARCH_RISCV64 | |||
| OS_WINDOWS | |||
| #endif | |||
| @@ -40,7 +40,7 @@ else() | |||
| c_${float_char}blas1.c) | |||
| endif() | |||
| target_link_libraries(x${float_char}cblat1 ${OpenBLAS_LIBNAME}) | |||
| if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD") | |||
| if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX") | |||
| target_link_libraries(x${float_char}cblat1 m) | |||
| endif() | |||
| add_test(NAME "x${float_char}cblat1" | |||
| @@ -65,7 +65,7 @@ else() | |||
| constant.c) | |||
| endif() | |||
| target_link_libraries(x${float_char}cblat2 ${OpenBLAS_LIBNAME}) | |||
| if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD") | |||
| if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX") | |||
| target_link_libraries(x${float_char}cblat2 m) | |||
| endif() | |||
| add_test(NAME "x${float_char}cblat2" | |||
| @@ -90,7 +90,7 @@ else() | |||
| constant.c) | |||
| endif() | |||
| target_link_libraries(x${float_char}cblat3 ${OpenBLAS_LIBNAME}) | |||
| if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD") | |||
| if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX") | |||
| target_link_libraries(x${float_char}cblat3 m) | |||
| endif() | |||
| add_test(NAME "x${float_char}cblat3" | |||
| @@ -969,7 +969,7 @@ real *sfac; | |||
| 1.17 }; | |||
| /* Local variables */ | |||
| extern /* Subroutine */ srottest_(); | |||
| extern /* Subroutine */ void srottest_(); | |||
| static integer i__, k, ksize; | |||
| extern /* Subroutine */ int stest_(), srotmtest_(); | |||
| static integer ki, kn; | |||
| @@ -304,6 +304,15 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||
| while (gemm_p * min_l > l2size) gemm_p -= GEMM_UNROLL_M; | |||
| } | |||
| BLASLONG pad_min_l = min_l; | |||
| #if defined(HALF) | |||
| #if defined(DYNAMIC_ARCH) | |||
| pad_min_l = (min_l + gotoblas->sbgemm_align_k - 1) & ~(gotoblas->sbgemm_align_k-1); | |||
| #else | |||
| pad_min_l = (min_l + SBGEMM_ALIGN_K - 1) & ~(SBGEMM_ALIGN_K - 1);; | |||
| #endif | |||
| #endif | |||
| /* First, we have to move data A to L2 cache */ | |||
| min_i = m_to - m_from; | |||
| l1stride = 1; | |||
| @@ -350,7 +359,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||
| START_RPCC(); | |||
| OCOPY_OPERATION(min_l, min_jj, b, ldb, ls, jjs, | |||
| sb + min_l * (jjs - js) * COMPSIZE * l1stride); | |||
| sb + pad_min_l * (jjs - js) * COMPSIZE * l1stride); | |||
| STOP_RPCC(outercost); | |||
| @@ -358,10 +367,10 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||
| #if !defined(XDOUBLE) || !defined(QUAD_PRECISION) | |||
| KERNEL_OPERATION(min_i, min_jj, min_l, alpha, | |||
| sa, sb + min_l * (jjs - js) * COMPSIZE * l1stride, c, ldc, m_from, jjs); | |||
| sa, sb + pad_min_l * (jjs - js) * COMPSIZE * l1stride, c, ldc, m_from, jjs); | |||
| #else | |||
| KERNEL_OPERATION(min_i, min_jj, min_l, (void *)&xalpha, | |||
| sa, sb + min_l * (jjs - js) * COMPSIZE * l1stride, c, ldc, m_from, jjs); | |||
| sa, sb + pad_min_l * (jjs - js) * COMPSIZE * l1stride, c, ldc, m_from, jjs); | |||
| #endif | |||
| STOP_RPCC(kernelcost); | |||
| @@ -324,6 +324,16 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||
| } else { | |||
| if (min_l > GEMM_Q) min_l = (min_l + 1) / 2; | |||
| } | |||
| BLASLONG pad_min_l = min_l; | |||
| #if defined(HALF) | |||
| #if defined(DYNAMIC_ARCH) | |||
| pad_min_l = (min_l + gotoblas->sbgemm_align_k - 1) & ~(gotoblas->sbgemm_align_k-1); | |||
| #else | |||
| pad_min_l = (min_l + SBGEMM_ALIGN_K - 1) & ~(SBGEMM_ALIGN_K - 1);; | |||
| #endif | |||
| #endif | |||
| /* Determine step size in m | |||
| * Note: We are currently on the first step in m | |||
| @@ -382,13 +392,13 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||
| /* Copy part of local region of B into workspace */ | |||
| START_RPCC(); | |||
| OCOPY_OPERATION(min_l, min_jj, b, ldb, ls, jjs, | |||
| buffer[bufferside] + min_l * (jjs - js) * COMPSIZE * l1stride); | |||
| buffer[bufferside] + pad_min_l * (jjs - js) * COMPSIZE * l1stride); | |||
| STOP_RPCC(copy_B); | |||
| /* Apply kernel with local region of A and part of local region of B */ | |||
| START_RPCC(); | |||
| KERNEL_OPERATION(min_i, min_jj, min_l, alpha, | |||
| sa, buffer[bufferside] + min_l * (jjs - js) * COMPSIZE * l1stride, | |||
| sa, buffer[bufferside] + pad_min_l * (jjs - js) * COMPSIZE * l1stride, | |||
| c, ldc, m_from, jjs); | |||
| STOP_RPCC(kernel); | |||
| @@ -470,9 +470,13 @@ blas_queue_t *tscq; | |||
| #endif | |||
| #ifdef CONSISTENT_FPCSR | |||
| #ifdef __aarch64__ | |||
| __asm__ __volatile__ ("msr fpcr, %0" : : "r" (queue -> sse_mode)); | |||
| #else | |||
| __asm__ __volatile__ ("ldmxcsr %0" : : "m" (queue -> sse_mode)); | |||
| __asm__ __volatile__ ("fldcw %0" : : "m" (queue -> x87_mode)); | |||
| #endif | |||
| #endif | |||
| #ifdef MONITOR | |||
| main_status[cpu] = MAIN_RUNNING1; | |||
| @@ -746,9 +750,13 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){ | |||
| queue -> position = pos; | |||
| #ifdef CONSISTENT_FPCSR | |||
| #ifdef __aarch64__ | |||
| __asm__ __volatile__ ("mrs %0, fpcr" : "=r" (queue -> sse_mode)); | |||
| #else | |||
| __asm__ __volatile__ ("fnstcw %0" : "=m" (queue -> x87_mode)); | |||
| __asm__ __volatile__ ("stmxcsr %0" : "=m" (queue -> sse_mode)); | |||
| #endif | |||
| #endif | |||
| #if defined(OS_LINUX) && !defined(NO_AFFINITY) && !defined(PARAMTEST) | |||
| @@ -69,6 +69,8 @@ | |||
| int blas_server_avail = 0; | |||
| extern int openblas_omp_adaptive_env(); | |||
| static void * blas_thread_buffer[MAX_PARALLEL_NUMBER][MAX_CPU_NUMBER]; | |||
| #ifdef HAVE_C11 | |||
| static atomic_bool blas_buffer_inuse[MAX_PARALLEL_NUMBER]; | |||
| @@ -282,8 +284,12 @@ static void exec_threads(blas_queue_t *queue, int buf_index){ | |||
| sb = queue -> sb; | |||
| #ifdef CONSISTENT_FPCSR | |||
| #ifdef __aarch64__ | |||
| __asm__ __volatile__ ("msr fpcr, %0" : : "r" (queue -> sse_mode)); | |||
| #else | |||
| __asm__ __volatile__ ("ldmxcsr %0" : : "m" (queue -> sse_mode)); | |||
| __asm__ __volatile__ ("fldcw %0" : : "m" (queue -> x87_mode)); | |||
| #endif | |||
| #endif | |||
| if ((sa == NULL) && (sb == NULL) && ((queue -> mode & BLAS_PTHREAD) == 0)) { | |||
| @@ -381,8 +387,12 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){ | |||
| #ifdef CONSISTENT_FPCSR | |||
| for (i = 0; i < num; i ++) { | |||
| #ifdef __aarch64__ | |||
| __asm__ __volatile__ ("mrs %0, fpcr" : "=r" (queue[i].sse_mode)); | |||
| #else | |||
| __asm__ __volatile__ ("fnstcw %0" : "=m" (queue[i].x87_mode)); | |||
| __asm__ __volatile__ ("stmxcsr %0" : "=m" (queue[i].sse_mode)); | |||
| #endif | |||
| } | |||
| #endif | |||
| @@ -278,12 +278,15 @@ static DWORD WINAPI blas_thread_server(void *arg){ | |||
| } else | |||
| #endif | |||
| if ((queue -> mode & BLAS_PREC) == BLAS_DOUBLE){ | |||
| #ifdef BUILD_DOUBLE | |||
| sb = (void *)(((BLASLONG)sa + ((DGEMM_P * DGEMM_Q * sizeof(double) | |||
| + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); | |||
| #endif | |||
| } else if ((queue -> mode & BLAS_PREC) == BLAS_SINGLE) { | |||
| #ifdef BUILD_SINGLE | |||
| sb = (void *)(((BLASLONG)sa + ((SGEMM_P * SGEMM_Q * sizeof(float) | |||
| + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); | |||
| #endif | |||
| } else { | |||
| /* Other types in future */ | |||
| } | |||
| @@ -295,11 +298,15 @@ static DWORD WINAPI blas_thread_server(void *arg){ | |||
| } else | |||
| #endif | |||
| if ((queue -> mode & BLAS_PREC) == BLAS_DOUBLE){ | |||
| #ifdef BUILD_COMPLEX16 | |||
| sb = (void *)(((BLASLONG)sa + ((ZGEMM_P * ZGEMM_Q * 2 * sizeof(double) | |||
| + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); | |||
| #endif | |||
| } else if ((queue -> mode & BLAS_PREC) == BLAS_SINGLE) { | |||
| #ifdef BUILD_COMPLEX | |||
| sb = (void *)(((BLASLONG)sa + ((CGEMM_P * CGEMM_Q * 2 * sizeof(float) | |||
| + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); | |||
| #endif | |||
| } else { | |||
| /* Other types in future */ | |||
| } | |||
| @@ -1018,7 +1018,7 @@ static gotoblas_t *force_coretype(char *coretype){ | |||
| char message[128]; | |||
| //char mname[20]; | |||
| for ( i=1 ; i <= 24; i++) | |||
| for ( i=1 ; i <= 25; i++) | |||
| { | |||
| if (!strncasecmp(coretype,corename[i],20)) | |||
| { | |||
| @@ -823,6 +823,8 @@ void gotoblas_affinity_init(void) { | |||
| if (numprocs == 0) numprocs = readenv_atoi("OMP_NUM_THREADS"); | |||
| if (numprocs == 0) numprocs = readenv_atoi("OPENBLAS_DEFAULT_NUM_THREADS"); | |||
| numnodes = 1; | |||
| if (numprocs == 1) { | |||
| @@ -67,10 +67,16 @@ void openblas_read_env() { | |||
| openblas_env_thread_timeout=(unsigned int)ret; | |||
| ret=0; | |||
| if (readenv(p,"OPENBLAS_NUM_THREADS")) ret = atoi(p); | |||
| if (readenv(p,"OPENBLAS_DEFAULT_NUM_THREADS")) ret = atoi(p); | |||
| if(ret<0) ret=0; | |||
| openblas_env_openblas_num_threads=ret; | |||
| ret=0; | |||
| if (readenv(p,"OPENBLAS_NUM_THREADS")) ret = atoi(p); | |||
| if(ret<0) ret=0; | |||
| if(ret != 0 || openblas_env_openblas_num_threads == 0) | |||
| openblas_env_openblas_num_threads=ret; | |||
| ret=0; | |||
| if (readenv(p,"GOTO_NUM_THREADS")) ret = atoi(p); | |||
| if(ret<0) ret=0; | |||
| @@ -82,10 +82,6 @@ else | |||
| vendor=FUJITSU | |||
| openmp='-Kopenmp' | |||
| ;; | |||
| *Cray*) | |||
| vendor=CRAY | |||
| openmp='-fopenmp' | |||
| ;; | |||
| *GNU*|*GCC*) | |||
| v="${data#*GCC: *\) }" | |||
| @@ -117,6 +113,10 @@ else | |||
| esac | |||
| fi | |||
| ;; | |||
| *Cray*) | |||
| vendor=CRAY | |||
| openmp='-fopenmp' | |||
| ;; | |||
| *g95*) | |||
| vendor=G95 | |||
| openmp='' | |||
| @@ -76,11 +76,6 @@ if ($compiler eq "") { | |||
| $vendor = FUJITSU; | |||
| $openmp = "-Kopenmp"; | |||
| } elsif ($data =~ /Cray/) { | |||
| $vendor = CRAY; | |||
| $openmp = "-fopenmp"; | |||
| } elsif ($data =~ /GNU/ || $data =~ /GCC/ ) { | |||
| $data =~ s/\(+.*?\)+//g; | |||
| @@ -106,6 +101,10 @@ if ($compiler eq "") { | |||
| $openmp = ""; | |||
| } | |||
| } | |||
| } elsif ($data =~ /Cray/) { | |||
| $vendor = CRAY; | |||
| $openmp = "-fopenmp"; | |||
| } | |||
| @@ -1410,7 +1410,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| "-DL2_SIZE=1048576 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \ | |||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ | |||
| "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DHAVE_SVE -DARMV8 " \ | |||
| "-march=armv8.4-a -mtune=neoverse-v1" | |||
| "-march=armv8.4-a+sve -mtune=neoverse-v1" | |||
| #define LIBNAME "neoversev1" | |||
| #define CORENAME "NEOVERSEV1" | |||
| #endif | |||
| @@ -53,7 +53,7 @@ set(BLAS2_COMPLEX_ONLY_MANGLED_SOURCES | |||
| # these do not have separate 'z' sources | |||
| set(BLAS3_SOURCES | |||
| gemm.c symm.c | |||
| trsm.c syrk.c syr2k.c | |||
| trsm.c syrk.c syr2k.c gemmt.c | |||
| ) | |||
| set(BLAS3_MANGLED_SOURCES | |||
| @@ -189,7 +189,16 @@ if (NOT DEFINED NO_LAPACK) | |||
| ) | |||
| GenerateNamedObjects("${LAPACK_SOURCES}") | |||
| if (NOT RELAPACK_REPLACE) | |||
| GenerateNamedObjects("${LAPACK_MANGLED_SOURCES}" "" "" 0 "" "" 0 3) | |||
| else () | |||
| GenerateNamedObjects("lapack/getrs.c" "" "" 0 "" "" 0 3) | |||
| GenerateNamedObjects("lapack/getf2.c" "" "" 0 "" "" 0 3) | |||
| GenerateNamedObjects("lapack/potf2.c" "" "" 0 "" "" 0 3) | |||
| GenerateNamedObjects("lapack/laswp.c" "" "" 0 "" "" 0 3) | |||
| GenerateNamedObjects("lapack/lauu2.c" "" "" 0 "" "" 0 3) | |||
| GenerateNamedObjects("lapack/trti2.c" "" "" 0 "" "" 0 3) | |||
| endif() | |||
| endif () | |||
| if ( BUILD_COMPLEX AND NOT BUILD_SINGLE) | |||
| @@ -44,12 +44,12 @@ SBLAS3OBJS = \ | |||
| sgemm.$(SUFFIX) ssymm.$(SUFFIX) strmm.$(SUFFIX) \ | |||
| strsm.$(SUFFIX) ssyrk.$(SUFFIX) ssyr2k.$(SUFFIX) \ | |||
| somatcopy.$(SUFFIX) simatcopy.$(SUFFIX)\ | |||
| sgeadd.$(SUFFIX) | |||
| sgeadd.$(SUFFIX) sgemmt.$(SUFFIX) | |||
| ifeq ($(BUILD_BFLOAT16),1) | |||
| SBBLAS1OBJS = sbdot.$(SUFFIX) | |||
| SBBLAS2OBJS = sbgemv.$(SUFFIX) | |||
| SBBLAS3OBJS = sbgemm.$(SUFFIX) | |||
| SBBLAS3OBJS = sbgemm.$(SUFFIX) sbgemmt.$(SUFFIX) | |||
| SBEXTOBJS = sbstobf16.$(SUFFIX) sbdtobf16.$(SUFFIX) sbf16tos.$(SUFFIX) dbf16tod.$(SUFFIX) | |||
| endif | |||
| @@ -76,7 +76,7 @@ DBLAS3OBJS = \ | |||
| dgemm.$(SUFFIX) dsymm.$(SUFFIX) dtrmm.$(SUFFIX) \ | |||
| dtrsm.$(SUFFIX) dsyrk.$(SUFFIX) dsyr2k.$(SUFFIX) \ | |||
| domatcopy.$(SUFFIX) dimatcopy.$(SUFFIX)\ | |||
| dgeadd.$(SUFFIX) | |||
| dgeadd.$(SUFFIX) dgemmt.$(SUFFIX) | |||
| CBLAS1OBJS = \ | |||
| caxpy.$(SUFFIX) caxpyc.$(SUFFIX) cswap.$(SUFFIX) \ | |||
| @@ -105,7 +105,7 @@ CBLAS3OBJS = \ | |||
| ctrsm.$(SUFFIX) csyrk.$(SUFFIX) csyr2k.$(SUFFIX) \ | |||
| chemm.$(SUFFIX) cherk.$(SUFFIX) cher2k.$(SUFFIX) \ | |||
| comatcopy.$(SUFFIX) cimatcopy.$(SUFFIX)\ | |||
| cgeadd.$(SUFFIX) | |||
| cgeadd.$(SUFFIX) cgemmt.$(SUFFIX) | |||
| ZBLAS1OBJS = \ | |||
| zaxpy.$(SUFFIX) zaxpyc.$(SUFFIX) zswap.$(SUFFIX) \ | |||
| @@ -134,7 +134,7 @@ ZBLAS3OBJS = \ | |||
| ztrsm.$(SUFFIX) zsyrk.$(SUFFIX) zsyr2k.$(SUFFIX) \ | |||
| zhemm.$(SUFFIX) zherk.$(SUFFIX) zher2k.$(SUFFIX) \ | |||
| zomatcopy.$(SUFFIX) zimatcopy.$(SUFFIX)\ | |||
| zgeadd.$(SUFFIX) | |||
| zgeadd.$(SUFFIX) zgemmt.$(SUFFIX) | |||
| ifeq ($(SUPPORT_GEMM3M), 1) | |||
| @@ -281,12 +281,12 @@ CSBLAS2OBJS = \ | |||
| CSBLAS3OBJS = \ | |||
| cblas_sgemm.$(SUFFIX) cblas_ssymm.$(SUFFIX) cblas_strmm.$(SUFFIX) cblas_strsm.$(SUFFIX) \ | |||
| cblas_ssyrk.$(SUFFIX) cblas_ssyr2k.$(SUFFIX) cblas_somatcopy.$(SUFFIX) cblas_simatcopy.$(SUFFIX)\ | |||
| cblas_sgeadd.$(SUFFIX) | |||
| cblas_sgeadd.$(SUFFIX) cblas_sgemmt.$(SUFFIX) | |||
| ifeq ($(BUILD_BFLOAT16),1) | |||
| CSBBLAS1OBJS = cblas_sbdot.$(SUFFIX) | |||
| CSBBLAS2OBJS = cblas_sbgemv.$(SUFFIX) | |||
| CSBBLAS3OBJS = cblas_sbgemm.$(SUFFIX) | |||
| CSBBLAS3OBJS = cblas_sbgemm.$(SUFFIX) cblas_sbgemmt.$(SUFFIX) | |||
| CSBEXTOBJS = cblas_sbstobf16.$(SUFFIX) cblas_sbdtobf16.$(SUFFIX) cblas_sbf16tos.$(SUFFIX) cblas_dbf16tod.$(SUFFIX) | |||
| endif | |||
| @@ -306,7 +306,7 @@ CDBLAS2OBJS = \ | |||
| CDBLAS3OBJS += \ | |||
| cblas_dgemm.$(SUFFIX) cblas_dsymm.$(SUFFIX) cblas_dtrmm.$(SUFFIX) cblas_dtrsm.$(SUFFIX) \ | |||
| cblas_dsyrk.$(SUFFIX) cblas_dsyr2k.$(SUFFIX) cblas_domatcopy.$(SUFFIX) cblas_dimatcopy.$(SUFFIX) \ | |||
| cblas_dgeadd.$(SUFFIX) | |||
| cblas_dgeadd.$(SUFFIX) cblas_dgemmt.$(SUFFIX) | |||
| CCBLAS1OBJS = \ | |||
| cblas_icamax.$(SUFFIX) cblas_icamin.$(SUFFIX) cblas_scasum.$(SUFFIX) cblas_caxpy.$(SUFFIX) \ | |||
| @@ -331,7 +331,7 @@ CCBLAS3OBJS = \ | |||
| cblas_csyrk.$(SUFFIX) cblas_csyr2k.$(SUFFIX) \ | |||
| cblas_chemm.$(SUFFIX) cblas_cherk.$(SUFFIX) cblas_cher2k.$(SUFFIX) \ | |||
| cblas_comatcopy.$(SUFFIX) cblas_cimatcopy.$(SUFFIX)\ | |||
| cblas_cgeadd.$(SUFFIX) | |||
| cblas_cgeadd.$(SUFFIX) cblas_cgemmt.$(SUFFIX) | |||
| CXERBLAOBJ = \ | |||
| cblas_xerbla.$(SUFFIX) | |||
| @@ -362,7 +362,7 @@ CZBLAS3OBJS = \ | |||
| cblas_zsyrk.$(SUFFIX) cblas_zsyr2k.$(SUFFIX) \ | |||
| cblas_zhemm.$(SUFFIX) cblas_zherk.$(SUFFIX) cblas_zher2k.$(SUFFIX)\ | |||
| cblas_zomatcopy.$(SUFFIX) cblas_zimatcopy.$(SUFFIX) \ | |||
| cblas_zgeadd.$(SUFFIX) | |||
| cblas_zgeadd.$(SUFFIX) cblas_zgemmt.$(SUFFIX) | |||
| ifeq ($(SUPPORT_GEMM3M), 1) | |||
| @@ -1300,6 +1300,8 @@ xhpr2.$(SUFFIX) xhpr2.$(PSUFFIX) : zhpr2.c | |||
| ifeq ($(BUILD_BFLOAT16),1) | |||
| sbgemm.$(SUFFIX) sbgemm.$(PSUFFIX) : gemm.c ../param.h | |||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||
| # Build both the $(SUFFIX) and $(PSUFFIX) bfloat16 GEMMT interface objects from the shared gemmt.c. | |||
| # The second target must be sbgemmt.$(PSUFFIX): naming sbgemm.$(PSUFFIX) here would redefine the | |||
| # target already produced by the sbgemm rule above and leave sbgemmt.$(PSUFFIX) without a rule. | |||
| sbgemmt.$(SUFFIX) sbgemmt.$(PSUFFIX) : gemmt.c ../param.h | |||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||
| endif | |||
| sgemm.$(SUFFIX) sgemm.$(PSUFFIX) : gemm.c ../param.h | |||
| @@ -1320,6 +1322,24 @@ zgemm.$(SUFFIX) zgemm.$(PSUFFIX) : gemm.c ../param.h | |||
| xgemm.$(SUFFIX) xgemm.$(PSUFFIX) : gemm.c ../param.h | |||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||
| sgemmt.$(SUFFIX) sgemmt.$(PSUFFIX) : gemmt.c ../param.h | |||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||
| dgemmt.$(SUFFIX) dgemmt.$(PSUFFIX) : gemmt.c ../param.h | |||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||
| qgemmt.$(SUFFIX) qgemmt.$(PSUFFIX) : gemmt.c ../param.h | |||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||
| cgemmt.$(SUFFIX) cgemmt.$(PSUFFIX) : gemmt.c ../param.h | |||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||
| zgemmt.$(SUFFIX) zgemmt.$(PSUFFIX) : gemmt.c ../param.h | |||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||
| xgemmt.$(SUFFIX) xgemmt.$(PSUFFIX) : gemmt.c ../param.h | |||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||
| ssymm.$(SUFFIX) ssymm.$(PSUFFIX) : symm.c | |||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||
| @@ -1907,6 +1927,23 @@ cblas_cgemm.$(SUFFIX) cblas_cgemm.$(PSUFFIX) : gemm.c ../param.h | |||
| cblas_zgemm.$(SUFFIX) cblas_zgemm.$(PSUFFIX) : gemm.c ../param.h | |||
| $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) | |||
| cblas_sgemmt.$(SUFFIX) cblas_sgemmt.$(PSUFFIX) : gemmt.c ../param.h | |||
| $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) | |||
| ifeq ($(BUILD_BFLOAT16),1) | |||
| cblas_sbgemmt.$(SUFFIX) cblas_sbgemmt.$(PSUFFIX) : gemmt.c ../param.h | |||
| $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) | |||
| endif | |||
| cblas_dgemmt.$(SUFFIX) cblas_dgemmt.$(PSUFFIX) : gemmt.c ../param.h | |||
| $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) | |||
| cblas_cgemmt.$(SUFFIX) cblas_cgemmt.$(PSUFFIX) : gemmt.c ../param.h | |||
| $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) | |||
| cblas_zgemmt.$(SUFFIX) cblas_zgemmt.$(PSUFFIX) : gemmt.c ../param.h | |||
| $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) | |||
| cblas_ssymm.$(SUFFIX) cblas_ssymm.$(PSUFFIX) : symm.c | |||
| $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) | |||
| @@ -0,0 +1,589 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2022, The OpenBLAS Project. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /*********************************************************************/ | |||
| #include <stdio.h> | |||
| #include <stdlib.h> | |||
| #include "common.h" | |||
| #ifdef FUNCTION_PROFILE | |||
| #include "functable.h" | |||
| #endif | |||
| /* Per-precision configuration for the GEMMT interface. | |||
| * ERROR_NAME is the routine name handed to xerbla when argument checking fails. | |||
| * SMP_THRESHOLD_MIN is defined per real/complex build but is not referenced by | |||
| * the GEMMT code in this file. */ | |||
| #ifndef COMPLEX | |||
| #define SMP_THRESHOLD_MIN 65536.0 | |||
| #ifdef XDOUBLE | |||
| #define ERROR_NAME "QGEMT " | |||
| #elif defined(DOUBLE) | |||
| #define ERROR_NAME "DGEMT " | |||
| #elif defined(BFLOAT16) | |||
| #define ERROR_NAME "SBGEMT " | |||
| #else | |||
| #define ERROR_NAME "SGEMT " | |||
| #endif | |||
| #else | |||
| #define SMP_THRESHOLD_MIN 8192.0 | |||
| #ifdef XDOUBLE | |||
| #define ERROR_NAME "XGEMT " | |||
| #elif defined(DOUBLE) | |||
| #define ERROR_NAME "ZGEMT " | |||
| #else | |||
| #define ERROR_NAME "CGEMT " | |||
| #endif | |||
| #endif | |||
| /* Fallback when the build does not supply a value; scales the j*k work cutoff | |||
| * below which each per-column GEMV runs single-threaded. */ | |||
| #ifndef GEMM_MULTITHREAD_THRESHOLD | |||
| #define GEMM_MULTITHREAD_THRESHOLD 4 | |||
| #endif | |||
| /* | |||
| * GEMMT interface: C := alpha * op(A) * op(B) + beta * C, where only the | |||
| * triangle of C selected by uplo is read and written. | |||
| * | |||
| * The same body is compiled twice: as the Fortran entry point NAME (all | |||
| * arguments by reference, character flags) and, with CBLAS defined, as the | |||
| * C entry point CNAME (enum flags plus a storage order). Both front ends | |||
| * reduce their arguments to m, n, k, a, b, c, lda, ldb, ldc and to the | |||
| * integer codes transa/transb (0 = N, 1 = T, 2 = conjugate-no-transpose, | |||
| * 3 = conjugate-transpose; real builds fold 2/3 onto 0/1) and uplo | |||
| * (0 = upper, 1 = lower). Invalid arguments are reported through xerbla | |||
| * and the routine returns without touching C. | |||
| * | |||
| * The computation walks the columns of C and issues one GEMV per column, | |||
| * restricted to the rows that belong to the requested triangle. | |||
| * | |||
| * NOTE(review): conjugation of B (transb codes 2 and 3) is not applied | |||
| * anywhere in this routine; only the storage layout of B is honoured. | |||
| * NOTE(review): the row-major CBLAS path swaps A/B and M/N but does not | |||
| * flip uplo; confirm against the intended row-major semantics. | |||
| */ | |||
| #ifndef CBLAS | |||
| void NAME(char *UPLO, char *TRANSA, char *TRANSB, | |||
| blasint * M, blasint * N, blasint * K, | |||
| FLOAT * Alpha, | |||
| IFLOAT * a, blasint * ldA, | |||
| IFLOAT * b, blasint * ldB, FLOAT * Beta, FLOAT * c, blasint * ldC) | |||
| { | |||
| blasint m, n, k; | |||
| blasint lda, ldb, ldc; | |||
| int transa, transb, uplo; | |||
| blasint info; | |||
| char transA, transB, Uplo; | |||
| IFLOAT *buffer; | |||
| IFLOAT *aa, *bb; | |||
| FLOAT *cc; | |||
| #if defined(COMPLEX) | |||
| FLOAT alpha_r, alpha_i, beta_r, beta_i; | |||
| #else | |||
| FLOAT alpha, beta; | |||
| #endif | |||
| PRINT_DEBUG_NAME; | |||
| m = *M; | |||
| n = *N; | |||
| k = *K; | |||
| #if defined(COMPLEX) | |||
| /* The threaded GEMV driver takes complex alpha by pointer. */ | |||
| FLOAT *alpha = Alpha; | |||
| alpha_r = *(Alpha + 0); | |||
| alpha_i = *(Alpha + 1); | |||
| beta_r = *(Beta + 0); | |||
| beta_i = *(Beta + 1); | |||
| #else | |||
| alpha = *Alpha; | |||
| beta = *Beta; | |||
| #endif | |||
| lda = *ldA; | |||
| ldb = *ldB; | |||
| ldc = *ldC; | |||
| transA = *TRANSA; | |||
| transB = *TRANSB; | |||
| Uplo = *UPLO; | |||
| TOUPPER(transA); | |||
| TOUPPER(transB); | |||
| TOUPPER(Uplo); | |||
| /* -1 marks "not recognised" and is rejected by the checks below. */ | |||
| transa = -1; | |||
| transb = -1; | |||
| uplo = -1; | |||
| if (transA == 'N') | |||
| transa = 0; | |||
| if (transA == 'T') | |||
| transa = 1; | |||
| #ifndef COMPLEX | |||
| if (transA == 'R') | |||
| transa = 0; | |||
| if (transA == 'C') | |||
| transa = 1; | |||
| #else | |||
| if (transA == 'R') | |||
| transa = 2; | |||
| if (transA == 'C') | |||
| transa = 3; | |||
| #endif | |||
| if (transB == 'N') | |||
| transb = 0; | |||
| if (transB == 'T') | |||
| transb = 1; | |||
| #ifndef COMPLEX | |||
| if (transB == 'R') | |||
| transb = 0; | |||
| if (transB == 'C') | |||
| transb = 1; | |||
| #else | |||
| if (transB == 'R') | |||
| transb = 2; | |||
| if (transB == 'C') | |||
| transb = 3; | |||
| #endif | |||
| if (Uplo == 'U') | |||
| uplo = 0; | |||
| if (Uplo == 'L') | |||
| uplo = 1; | |||
| /* Checks run from the highest code to the lowest so that the last | |||
| * assignment, i.e. the lowest-numbered failure, is the one reported. */ | |||
| info = 0; | |||
| if (uplo < 0) | |||
| info = 14; | |||
| if (ldc < m) | |||
| info = 13; | |||
| if (k < 0) | |||
| info = 5; | |||
| if (n < 0) | |||
| info = 4; | |||
| if (m < 0) | |||
| info = 3; | |||
| if (transb < 0) | |||
| info = 2; | |||
| if (transa < 0) | |||
| info = 1; | |||
| if (info) { | |||
| BLASFUNC(xerbla) (ERROR_NAME, &info, sizeof(ERROR_NAME)); | |||
| return; | |||
| } | |||
| #else | |||
| void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, | |||
| enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, | |||
| blasint N, blasint k, | |||
| #ifndef COMPLEX | |||
| FLOAT alpha, | |||
| IFLOAT * A, blasint LDA, | |||
| IFLOAT * B, blasint LDB, FLOAT beta, FLOAT * c, blasint ldc) | |||
| { | |||
| #else | |||
| void *valpha, | |||
| void *va, blasint LDA, | |||
| void *vb, blasint LDB, void *vbeta, void *vc, blasint ldc) | |||
| { | |||
| FLOAT *alpha = (FLOAT *) valpha; | |||
| FLOAT *beta = (FLOAT *) vbeta; | |||
| FLOAT *A = (FLOAT *) va; | |||
| FLOAT *B = (FLOAT *) vb; | |||
| FLOAT *c = (FLOAT *) vc; | |||
| #endif | |||
| FLOAT *aa, *bb, *cc; | |||
| int transa, transb, uplo; | |||
| blasint info; | |||
| blasint m, n, lda, ldb; | |||
| FLOAT *a, *b; | |||
| XFLOAT *buffer; | |||
| PRINT_DEBUG_CNAME; | |||
| transa = -1; | |||
| transb = -1; | |||
| /* info stays 0 when order is neither CblasColMajor nor CblasRowMajor, | |||
| * which the "info >= 0" test below reports as an error. */ | |||
| info = 0; | |||
| if (order == CblasColMajor) { | |||
| if (TransA == CblasNoTrans) | |||
| transa = 0; | |||
| if (TransA == CblasTrans) | |||
| transa = 1; | |||
| #ifndef COMPLEX | |||
| if (TransA == CblasConjNoTrans) | |||
| transa = 0; | |||
| if (TransA == CblasConjTrans) | |||
| transa = 1; | |||
| #else | |||
| if (TransA == CblasConjNoTrans) | |||
| transa = 2; | |||
| if (TransA == CblasConjTrans) | |||
| transa = 3; | |||
| #endif | |||
| if (TransB == CblasNoTrans) | |||
| transb = 0; | |||
| if (TransB == CblasTrans) | |||
| transb = 1; | |||
| #ifndef COMPLEX | |||
| if (TransB == CblasConjNoTrans) | |||
| transb = 0; | |||
| if (TransB == CblasConjTrans) | |||
| transb = 1; | |||
| #else | |||
| if (TransB == CblasConjNoTrans) | |||
| transb = 2; | |||
| if (TransB == CblasConjTrans) | |||
| transb = 3; | |||
| #endif | |||
| m = M; | |||
| n = N; | |||
| a = (void *)A; | |||
| b = (void *)B; | |||
| lda = LDA; | |||
| ldb = LDB; | |||
| /* -1 means "no error"; see the info >= 0 test after the uplo check. */ | |||
| info = -1; | |||
| if (ldc < m) | |||
| info = 13; | |||
| if (k < 0) | |||
| info = 5; | |||
| if (n < 0) | |||
| info = 4; | |||
| if (m < 0) | |||
| info = 3; | |||
| if (transb < 0) | |||
| info = 2; | |||
| if (transa < 0) | |||
| info = 1; | |||
| } | |||
| if (order == CblasRowMajor) { | |||
| /* Row-major is handled as the transposed column-major problem: | |||
| * the roles of A/B, M/N and TransA/TransB are exchanged. */ | |||
| m = N; | |||
| n = M; | |||
| a = (void *)B; | |||
| b = (void *)A; | |||
| lda = LDB; | |||
| ldb = LDA; | |||
| if (TransB == CblasNoTrans) | |||
| transa = 0; | |||
| if (TransB == CblasTrans) | |||
| transa = 1; | |||
| #ifndef COMPLEX | |||
| if (TransB == CblasConjNoTrans) | |||
| transa = 0; | |||
| if (TransB == CblasConjTrans) | |||
| transa = 1; | |||
| #else | |||
| if (TransB == CblasConjNoTrans) | |||
| transa = 2; | |||
| if (TransB == CblasConjTrans) | |||
| transa = 3; | |||
| #endif | |||
| if (TransA == CblasNoTrans) | |||
| transb = 0; | |||
| if (TransA == CblasTrans) | |||
| transb = 1; | |||
| #ifndef COMPLEX | |||
| if (TransA == CblasConjNoTrans) | |||
| transb = 0; | |||
| if (TransA == CblasConjTrans) | |||
| transb = 1; | |||
| #else | |||
| if (TransA == CblasConjNoTrans) | |||
| transb = 2; | |||
| if (TransA == CblasConjTrans) | |||
| transb = 3; | |||
| #endif | |||
| info = -1; | |||
| if (ldc < m) | |||
| info = 13; | |||
| if (k < 0) | |||
| info = 5; | |||
| if (n < 0) | |||
| info = 4; | |||
| if (m < 0) | |||
| info = 3; | |||
| if (transb < 0) | |||
| info = 2; | |||
| if (transa < 0) | |||
| info = 1; | |||
| } | |||
| uplo = -1; | |||
| if (Uplo == CblasUpper) | |||
| uplo = 0; | |||
| if (Uplo == CblasLower) | |||
| uplo = 1; | |||
| if (uplo < 0) | |||
| info = 14; | |||
| if (info >= 0) { | |||
| BLASFUNC(xerbla) (ERROR_NAME, &info, sizeof(ERROR_NAME)); | |||
| return; | |||
| } | |||
| #if defined(COMPLEX) | |||
| FLOAT alpha_r = *(alpha + 0); | |||
| FLOAT alpha_i = *(alpha + 1); | |||
| FLOAT beta_r = *(beta + 0); | |||
| FLOAT beta_i = *(beta + 1); | |||
| #endif | |||
| #endif | |||
| int buffer_size; | |||
| blasint i, j; | |||
| #ifdef SMP | |||
| int nthreads; | |||
| #endif | |||
| /* GEMV dispatch tables, indexed by the transa code computed above. */ | |||
| #if defined(COMPLEX) | |||
| #ifdef SMP | |||
| static int (*gemv_thread[]) (BLASLONG, BLASLONG, FLOAT *, FLOAT *, | |||
| BLASLONG, FLOAT *, BLASLONG, FLOAT *, | |||
| BLASLONG, FLOAT *, int) = { | |||
| #ifdef XDOUBLE | |||
| xgemv_thread_n, xgemv_thread_t, xgemv_thread_r, xgemv_thread_c, | |||
| xgemv_thread_o, xgemv_thread_u, xgemv_thread_s, | |||
| xgemv_thread_d, | |||
| #elif defined DOUBLE | |||
| zgemv_thread_n, zgemv_thread_t, zgemv_thread_r, zgemv_thread_c, | |||
| zgemv_thread_o, zgemv_thread_u, zgemv_thread_s, | |||
| zgemv_thread_d, | |||
| #else | |||
| cgemv_thread_n, cgemv_thread_t, cgemv_thread_r, cgemv_thread_c, | |||
| cgemv_thread_o, cgemv_thread_u, cgemv_thread_s, | |||
| cgemv_thread_d, | |||
| #endif | |||
| }; | |||
| #endif | |||
| int (*gemv[]) (BLASLONG, BLASLONG, BLASLONG, FLOAT, FLOAT, FLOAT *, | |||
| BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG, | |||
| FLOAT *) = { | |||
| GEMV_N, GEMV_T, GEMV_R, GEMV_C, GEMV_O, GEMV_U, GEMV_S, GEMV_D,}; | |||
| #else | |||
| #ifdef SMP | |||
| static int (*gemv_thread[]) (BLASLONG, BLASLONG, FLOAT, FLOAT *, | |||
| BLASLONG, FLOAT *, BLASLONG, FLOAT *, | |||
| BLASLONG, FLOAT *, int) = { | |||
| #ifdef XDOUBLE | |||
| qgemv_thread_n, qgemv_thread_t, | |||
| #elif defined DOUBLE | |||
| dgemv_thread_n, dgemv_thread_t, | |||
| #else | |||
| sgemv_thread_n, sgemv_thread_t, | |||
| #endif | |||
| }; | |||
| #endif | |||
| int (*gemv[]) (BLASLONG, BLASLONG, BLASLONG, FLOAT, FLOAT *, BLASLONG, | |||
| FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *) = { | |||
| GEMV_N, GEMV_T,}; | |||
| #endif | |||
| if ((m == 0) || (n == 0)) | |||
| return; | |||
| /* Opened once here and closed once after the column loop. */ | |||
| IDEBUG_START; | |||
| FUNCTION_PROFILE_START(); | |||
| /* Bit 0 of the trans code says whether the operand is stored transposed; | |||
| * codes 2/3 differ from 0/1 only by conjugation, so testing the whole code | |||
| * for non-zero would wrongly treat conjugate-no-transpose as transposed. */ | |||
| const int a_trans = transa & 1; | |||
| const int b_trans = transb & 1; | |||
| /* Column i of op(B) is contiguous in B, or a row of B with stride ldb. */ | |||
| const blasint incb = b_trans ? ldb : 1; | |||
| for (i = 0; i < n; i++) { | |||
| BLASLONG row0, gemv_m, gemv_n; | |||
| /* Rows of column i that lie in the requested triangle: | |||
| * lower -> rows i..n-1, upper -> rows 0..i. */ | |||
| if (uplo == 1) { | |||
| row0 = i; | |||
| j = n - i; | |||
| } else { | |||
| row0 = 0; | |||
| j = i + 1; | |||
| } | |||
| /* Offsets are formed in BLASLONG so large leading dimensions cannot | |||
| * overflow a 32-bit blasint. Rows row0.. of op(A) are rows of A when A is | |||
| * not transposed and columns of A when it is. The start of column i of | |||
| * op(B) depends on how B is stored, not on how A is stored. */ | |||
| aa = a + (a_trans ? (BLASLONG)lda * row0 : row0) * COMPSIZE; | |||
| bb = b + (b_trans ? (BLASLONG)i : (BLASLONG)ldb * i) * COMPSIZE; | |||
| cc = c + ((BLASLONG)ldc * i + row0) * COMPSIZE; | |||
| /* Exactly the j in-triangle entries of this column are scaled by beta, | |||
| * whatever the transposition of A. A zero alpha skips only the product for | |||
| * this column; the remaining columns must still be scaled, so this is a | |||
| * continue (not a return) in both the real and the complex build. */ | |||
| #if defined(COMPLEX) | |||
| if (beta_r != ONE || beta_i != ZERO) | |||
| SCAL_K(j, 0, 0, beta_r, beta_i, cc, 1, NULL, 0, | |||
| NULL, 0); | |||
| if (alpha_r == ZERO && alpha_i == ZERO) | |||
| continue; | |||
| #else | |||
| if (beta != ONE) | |||
| SCAL_K(j, 0, 0, beta, cc, 1, NULL, 0, NULL, 0); | |||
| if (alpha == ZERO) | |||
| continue; | |||
| #endif | |||
| /* GEMV takes the dimensions of the matrix as stored: j x k when A is not | |||
| * transposed, k x j when it is. The result always has j entries. */ | |||
| gemv_m = a_trans ? k : j; | |||
| gemv_n = a_trans ? j : k; | |||
| buffer_size = j + k + 128 / sizeof(FLOAT); | |||
| #ifdef WINDOWS_ABI | |||
| buffer_size += 160 / sizeof(FLOAT); | |||
| #endif | |||
| // for alignment | |||
| buffer_size = (buffer_size + 3) & ~3; | |||
| STACK_ALLOC(buffer_size, FLOAT, buffer); | |||
| #ifdef SMP | |||
| /* Small per-column products are not worth dispatching to threads. */ | |||
| if (1L * j * k < 2304L * GEMM_MULTITHREAD_THRESHOLD) | |||
| nthreads = 1; | |||
| else | |||
| nthreads = num_cpu_avail(2); | |||
| if (nthreads == 1) { | |||
| #endif | |||
| #if defined(COMPLEX) | |||
| (gemv[(int)transa]) (gemv_m, gemv_n, 0, alpha_r, alpha_i, | |||
| aa, lda, bb, incb, cc, 1, | |||
| buffer); | |||
| #else | |||
| (gemv[(int)transa]) (gemv_m, gemv_n, 0, alpha, aa, lda, | |||
| bb, incb, cc, 1, buffer); | |||
| #endif | |||
| #ifdef SMP | |||
| } else { | |||
| (gemv_thread[(int)transa]) (gemv_m, gemv_n, alpha, aa, | |||
| lda, bb, incb, cc, | |||
| 1, buffer, | |||
| nthreads); | |||
| } | |||
| #endif | |||
| STACK_FREE(buffer); | |||
| } | |||
| /* No "args" structure exists in this routine; the profile figures are | |||
| * derived from the local problem dimensions instead. */ | |||
| FUNCTION_PROFILE_END(COMPSIZE * COMPSIZE, | |||
| m * k + k * n + | |||
| m * n, 2 * m * n * k); | |||
| IDEBUG_END; | |||
| return; | |||
| } | |||
| @@ -23,7 +23,7 @@ ifeq ($(C_COMPILER), CLANG) | |||
| # Any clang posing as gcc 4.2 should be new enough (3.4 or later) | |||
| GCCVERSIONCHECK := $(GCCVERSIONGT4)$(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ2) | |||
| ifeq ($(GCCVERSIONCHECK), $(filter $(GCCVERSIONCHECK), 011 110 111)) | |||
| AVX2OPT = -mavx2 | |||
| AVX2OPT = -mavx2 -mfma | |||
| endif | |||
| endif | |||
| ifdef NO_AVX2 | |||
| @@ -73,6 +73,8 @@ else ifeq ($(TARGET_CORE), SKYLAKEX) | |||
| endif | |||
| else ifeq ($(TARGET_CORE), HASWELL) | |||
| override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) $(AVX2OPT) | |||
| else ifeq ($(TARGET_CORE), ZEN) | |||
| override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) $(AVX2OPT) | |||
| else ifeq ($(TARGET_CORE), LOONGSON3R4) | |||
| override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) $(MSA_FLAGS) | |||
| else | |||
| @@ -96,8 +96,8 @@ DNRM2KERNEL = dznrm2_thunderx2t99.c | |||
| CNRM2KERNEL = scnrm2_thunderx2t99.c | |||
| ZNRM2KERNEL = dznrm2_thunderx2t99.c | |||
| DDOTKERNEL = dot_thunderx2t99.c | |||
| SDOTKERNEL = dot_thunderx2t99.c | |||
| DDOTKERNEL = dot.c | |||
| SDOTKERNEL = dot.c | |||
| CDOTKERNEL = zdot_thunderx2t99.c | |||
| ZDOTKERNEL = zdot_thunderx2t99.c | |||
| DSDOTKERNEL = dot.S | |||
| @@ -96,8 +96,8 @@ DNRM2KERNEL = dznrm2_thunderx2t99.c | |||
| CNRM2KERNEL = scnrm2_thunderx2t99.c | |||
| ZNRM2KERNEL = dznrm2_thunderx2t99.c | |||
| DDOTKERNEL = dot_thunderx2t99.c | |||
| SDOTKERNEL = dot_thunderx2t99.c | |||
| DDOTKERNEL = dot.c | |||
| SDOTKERNEL = dot.c | |||
| CDOTKERNEL = zdot_thunderx2t99.c | |||
| ZDOTKERNEL = zdot_thunderx2t99.c | |||
| DSDOTKERNEL = dot.S | |||
| @@ -190,10 +190,10 @@ ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| SBGEMM_BETA = sbgemm_beta_neoversen2.c | |||
| SBGEMMKERNEL = sbgemm_kernel_$(SBGEMM_UNROLL_M)x$(SBGEMM_UNROLL_N)_neoversen2.c | |||
| SBGEMMINCOPY = sbgemm_ncopy_neoversen2.c | |||
| SBGEMMITCOPY = sbgemm_tcopy_neoversen2.c | |||
| SBGEMMONCOPY = sbgemm_ncopy_neoversen2.c | |||
| SBGEMMOTCOPY = sbgemm_tcopy_neoversen2.c | |||
| SBGEMMINCOPY = sbgemm_ncopy_$(SBGEMM_UNROLL_N)_neoversen2.c | |||
| SBGEMMITCOPY = sbgemm_tcopy_$(SBGEMM_UNROLL_M)_neoversen2.c | |||
| SBGEMMONCOPY = sbgemm_ncopy_$(SBGEMM_UNROLL_N)_neoversen2.c | |||
| SBGEMMOTCOPY = sbgemm_tcopy_$(SBGEMM_UNROLL_M)_neoversen2.c | |||
| SBGEMMINCOPYOBJ = sbgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| SBGEMMITCOPYOBJ = sbgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| SBGEMMONCOPYOBJ = sbgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| @@ -96,8 +96,8 @@ DNRM2KERNEL = dznrm2_thunderx2t99.c | |||
| CNRM2KERNEL = scnrm2_thunderx2t99.c | |||
| ZNRM2KERNEL = dznrm2_thunderx2t99.c | |||
| DDOTKERNEL = dot_thunderx2t99.c | |||
| SDOTKERNEL = dot_thunderx2t99.c | |||
| DDOTKERNEL = dot.c | |||
| SDOTKERNEL = dot.c | |||
| CDOTKERNEL = zdot_thunderx2t99.c | |||
| ZDOTKERNEL = zdot_thunderx2t99.c | |||
| DSDOTKERNEL = dot.S | |||
| @@ -161,8 +161,8 @@ DNRM2KERNEL = dznrm2_thunderx2t99.c | |||
| ZNRM2KERNEL = dznrm2_thunderx2t99.c | |||
| DDOTKERNEL = dot_thunderx2t99.c | |||
| SDOTKERNEL = dot_thunderx2t99.c | |||
| DDOTKERNEL = dot.c | |||
| SDOTKERNEL = dot.c | |||
| CDOTKERNEL = zdot_thunderx2t99.c | |||
| ZDOTKERNEL = zdot_thunderx2t99.c | |||
| DSDOTKERNEL = dot.S | |||
| @@ -161,8 +161,8 @@ DNRM2KERNEL = dznrm2_thunderx2t99.c | |||
| ZNRM2KERNEL = dznrm2_thunderx2t99.c | |||
| DDOTKERNEL = dot_thunderx2t99.c | |||
| SDOTKERNEL = dot_thunderx2t99.c | |||
| DDOTKERNEL = dot.c | |||
| SDOTKERNEL = dot.c | |||
| CDOTKERNEL = zdot_thunderx2t99.c | |||
| ZDOTKERNEL = zdot_thunderx2t99.c | |||
| DSDOTKERNEL = dot.S | |||
| @@ -0,0 +1,121 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2017, The OpenBLAS Project | |||
| Copyright (c) 2022, Arm Ltd | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| // Some compilers will report feature support for SVE without the appropriate | |||
| // header available | |||
| #ifdef HAVE_SVE | |||
| #if defined __has_include | |||
| #if __has_include(<arm_sve.h>) && __ARM_FEATURE_SVE | |||
| #define USE_SVE | |||
| #endif | |||
| #endif | |||
| #endif | |||
| #ifdef USE_SVE | |||
| #include "dot_kernel_sve.c" | |||
| #endif | |||
| #include "dot_kernel_asimd.c" | |||
| #if defined(SMP) | |||
| extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n, | |||
| BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, | |||
| void *c, BLASLONG ldc, int (*function)(), int nthreads); | |||
| #endif | |||
| static RETURN_TYPE dot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| { | |||
| RETURN_TYPE dot = 0.0 ; | |||
| if ( n <= 0 ) return dot; | |||
| #ifdef USE_SVE | |||
| if (inc_x == 1 && inc_y == 1) { | |||
| return dot_kernel_sve(n, x, y); | |||
| } | |||
| #endif | |||
| return dot_kernel_asimd(n, x, inc_x, y, inc_y); | |||
| } | |||
| #if defined(SMP) | |||
| static int dot_thread_function(BLASLONG n, BLASLONG dummy0, | |||
| BLASLONG dummy1, FLOAT dummy2, FLOAT *x, BLASLONG inc_x, FLOAT *y, | |||
| BLASLONG inc_y, FLOAT *result, BLASLONG dummy3) | |||
| { | |||
| *(RETURN_TYPE *)result = dot_compute(n, x, inc_x, y, inc_y); | |||
| return 0; | |||
| } | |||
| #endif | |||
| RETURN_TYPE CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| { | |||
| #if defined(SMP) | |||
| int nthreads; | |||
| FLOAT dummy_alpha; | |||
| #endif | |||
| RETURN_TYPE dot = 0.0; | |||
| #if defined(SMP) | |||
| if (inc_x == 0 || inc_y == 0 || n <= 10000) | |||
| nthreads = 1; | |||
| else | |||
| nthreads = num_cpu_avail(1); | |||
| if (nthreads == 1) { | |||
| dot = dot_compute(n, x, inc_x, y, inc_y); | |||
| } else { | |||
| int mode, i; | |||
| char result[MAX_CPU_NUMBER * sizeof(double) * 2]; | |||
| RETURN_TYPE *ptr; | |||
| #if !defined(DOUBLE) | |||
| mode = BLAS_SINGLE | BLAS_REAL; | |||
| #else | |||
| mode = BLAS_DOUBLE | BLAS_REAL; | |||
| #endif | |||
| blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, | |||
| x, inc_x, y, inc_y, result, 0, | |||
| ( void *)dot_thread_function, nthreads); | |||
| ptr = (RETURN_TYPE *)result; | |||
| for (i = 0; i < nthreads; i++) { | |||
| dot = dot + (*ptr); | |||
| ptr = (RETURN_TYPE *)(((char *)ptr) + sizeof(double) * 2); | |||
| } | |||
| } | |||
| #else | |||
| dot = dot_compute(n, x, inc_x, y, inc_y); | |||
| #endif | |||
| return dot; | |||
| } | |||
| @@ -1,5 +1,6 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2017, The OpenBLAS Project | |||
| Copyright (c) 2022, Arm Ltd | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| @@ -36,25 +37,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define RETURN_TYPE double | |||
| #endif | |||
| #define N "x0" /* vector length */ | |||
| #define X "x1" /* "X" vector address */ | |||
| #define INC_X "x2" /* "X" stride */ | |||
| #define Y "x3" /* "Y" vector address */ | |||
| #define INC_Y "x4" /* "Y" stride */ | |||
| #define J "x5" /* loop variable */ | |||
| #if !defined(DOUBLE) | |||
| #if !defined(DSDOT) | |||
| #define DOT_MOD "s" | |||
| #define REG0 "wzr" | |||
| #define DOTF "s0" | |||
| #define TMPX "s16" | |||
| #define TMPY "s24" | |||
| #define INC_SHIFT "2" | |||
| #define N_DIV_SHIFT "6" | |||
| #define N_REM_MASK "63" | |||
| #else | |||
| #define DOT_MOD "d" | |||
| #define REG0 "xzr" | |||
| #define DOTF "d0" | |||
| #define TMPX "s16" | |||
| #define TMPX1 "d2" | |||
| #define TMPY "s24" | |||
| @@ -64,8 +58,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define N_REM_MASK "15" | |||
| #endif | |||
| #else | |||
| #define DOT_MOD "d" | |||
| #define REG0 "xzr" | |||
| #define DOTF "d0" | |||
| #define TMPX "d16" | |||
| #define TMPY "d24" | |||
| #define INC_SHIFT "3" | |||
| @@ -73,59 +67,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define N_REM_MASK "31" | |||
| #endif | |||
| #define OUT "%"DOT_MOD"[DOT_]" | |||
| #if !defined(DOUBLE) | |||
| #if !defined(DSDOT) | |||
| #define KERNEL_F1 \ | |||
| " ldr "TMPX", ["X"] \n" \ | |||
| " ldr "TMPY", ["Y"] \n" \ | |||
| " add "X", "X", "INC_X" \n" \ | |||
| " add "Y", "Y", "INC_Y" \n" \ | |||
| " fmadd "DOTF", "TMPX", "TMPY", "DOTF" \n" | |||
| " ldr "TMPX", [%[X_]] \n" \ | |||
| " ldr "TMPY", [%[Y_]] \n" \ | |||
| " add %[X_], %[X_], %[INCX_] \n" \ | |||
| " add %[Y_], %[Y_], %[INCY_] \n" \ | |||
| " fmadd "OUT", "TMPX", "TMPY", "OUT" \n" | |||
| #define KERNEL_F \ | |||
| " ldp q16, q17, ["X"] \n" \ | |||
| " ldp q24, q25, ["Y"] \n" \ | |||
| " ldp q18, q19, ["X", #32] \n" \ | |||
| " ldp q26, q27, ["Y", #32] \n" \ | |||
| " ldp q16, q17, [%[X_]] \n" \ | |||
| " ldp q24, q25, [%[Y_]] \n" \ | |||
| " ldp q18, q19, [%[X_], #32] \n" \ | |||
| " ldp q26, q27, [%[Y_], #32] \n" \ | |||
| " fmla v0.4s, v16.4s, v24.4s \n" \ | |||
| " fmla v1.4s, v17.4s, v25.4s \n" \ | |||
| " ldp q20, q21, ["X", #64] \n" \ | |||
| " ldp q28, q29, ["Y", #64] \n" \ | |||
| " ldp q20, q21, [%[X_], #64] \n" \ | |||
| " ldp q28, q29, [%[Y_], #64] \n" \ | |||
| " fmla v2.4s, v18.4s, v26.4s \n" \ | |||
| " fmla v3.4s, v19.4s, v27.4s \n" \ | |||
| " ldp q22, q23, ["X", #96] \n" \ | |||
| " ldp q30, q31, ["Y", #96] \n" \ | |||
| " add "Y", "Y", #128 \n" \ | |||
| " add "X", "X", #128 \n" \ | |||
| " ldp q22, q23, [%[X_], #96] \n" \ | |||
| " ldp q30, q31, [%[Y_], #96] \n" \ | |||
| " add %[Y_], %[Y_], #128 \n" \ | |||
| " add %[X_], %[X_], #128 \n" \ | |||
| " fmla v4.4s, v20.4s, v28.4s \n" \ | |||
| " fmla v5.4s, v21.4s, v29.4s \n" \ | |||
| " PRFM PLDL1KEEP, ["X", #896] \n" \ | |||
| " PRFM PLDL1KEEP, ["Y", #896] \n" \ | |||
| " PRFM PLDL1KEEP, ["X", #896+64] \n" \ | |||
| " PRFM PLDL1KEEP, ["Y", #896+64] \n" \ | |||
| " PRFM PLDL1KEEP, [%[X_], #896] \n" \ | |||
| " PRFM PLDL1KEEP, [%[Y_], #896] \n" \ | |||
| " PRFM PLDL1KEEP, [%[X_], #896+64] \n" \ | |||
| " PRFM PLDL1KEEP, [%[Y_], #896+64] \n" \ | |||
| " fmla v6.4s, v22.4s, v30.4s \n" \ | |||
| " fmla v7.4s, v23.4s, v31.4s \n" \ | |||
| " ldp q16, q17, ["X"] \n" \ | |||
| " ldp q24, q25, ["Y"] \n" \ | |||
| " ldp q18, q19, ["X", #32] \n" \ | |||
| " ldp q26, q27, ["Y", #32] \n" \ | |||
| " ldp q16, q17, [%[X_]] \n" \ | |||
| " ldp q24, q25, [%[Y_]] \n" \ | |||
| " ldp q18, q19, [%[X_], #32] \n" \ | |||
| " ldp q26, q27, [%[Y_], #32] \n" \ | |||
| " fmla v0.4s, v16.4s, v24.4s \n" \ | |||
| " fmla v1.4s, v17.4s, v25.4s \n" \ | |||
| " ldp q20, q21, ["X", #64] \n" \ | |||
| " ldp q28, q29, ["Y", #64] \n" \ | |||
| " ldp q20, q21, [%[X_], #64] \n" \ | |||
| " ldp q28, q29, [%[Y_], #64] \n" \ | |||
| " fmla v2.4s, v18.4s, v26.4s \n" \ | |||
| " fmla v3.4s, v19.4s, v27.4s \n" \ | |||
| " ldp q22, q23, ["X", #96] \n" \ | |||
| " ldp q30, q31, ["Y", #96] \n" \ | |||
| " add "Y", "Y", #128 \n" \ | |||
| " add "X", "X", #128 \n" \ | |||
| " ldp q22, q23, [%[X_], #96] \n" \ | |||
| " ldp q30, q31, [%[Y_], #96] \n" \ | |||
| " add %[Y_], %[Y_], #128 \n" \ | |||
| " add %[X_], %[X_], #128 \n" \ | |||
| " fmla v4.4s, v20.4s, v28.4s \n" \ | |||
| " fmla v5.4s, v21.4s, v29.4s \n" \ | |||
| " PRFM PLDL1KEEP, ["X", #896] \n" \ | |||
| " PRFM PLDL1KEEP, ["Y", #896] \n" \ | |||
| " PRFM PLDL1KEEP, ["X", #896+64] \n" \ | |||
| " PRFM PLDL1KEEP, ["Y", #896+64] \n" \ | |||
| " PRFM PLDL1KEEP, [%[X_], #896] \n" \ | |||
| " PRFM PLDL1KEEP, [%[Y_], #896] \n" \ | |||
| " PRFM PLDL1KEEP, [%[X_], #896+64] \n" \ | |||
| " PRFM PLDL1KEEP, [%[Y_], #896+64] \n" \ | |||
| " fmla v6.4s, v22.4s, v30.4s \n" \ | |||
| " fmla v7.4s, v23.4s, v31.4s \n" | |||
| @@ -142,19 +138,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #else /* !defined(DSDOT) */ | |||
| #define KERNEL_F1 \ | |||
| " ldr "TMPX", ["X"] \n" \ | |||
| " ldr "TMPY", ["Y"] \n" \ | |||
| " add "X", "X", "INC_X" \n" \ | |||
| " add "Y", "Y", "INC_Y" \n" \ | |||
| " ldr "TMPX", [%[X_]] \n" \ | |||
| " ldr "TMPY", [%[Y_]] \n" \ | |||
| " add %[X_], %[X_], %[INCX_] \n" \ | |||
| " add %[Y_], %[Y_], %[INCY_] \n" \ | |||
| " fcvt "TMPX1", "TMPX" \n" \ | |||
| " fcvt "TMPY1", "TMPY" \n" \ | |||
| " fmul "TMPX1", "TMPX1", "TMPY1" \n" \ | |||
| " fadd "DOTF", "DOTF", "TMPX1" \n" | |||
| " fadd "OUT", "OUT", "TMPX1" \n" | |||
| #define KERNEL_F \ | |||
| " ldp q18, q19, ["X"] \n" \ | |||
| " ldp q26, q27, ["Y"] \n" \ | |||
| " ldp q18, q19, [%[X_]] \n" \ | |||
| " ldp q26, q27, [%[Y_]] \n" \ | |||
| " fcvtl v16.2d, v18.2s \n" \ | |||
| " fcvtl2 v17.2d, v18.4s \n" \ | |||
| " fcvtl v18.2d, v19.2s \n" \ | |||
| @@ -163,8 +159,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| " fcvtl2 v25.2d, v26.4s \n" \ | |||
| " fcvtl v26.2d, v27.2s \n" \ | |||
| " fcvtl2 v27.2d, v27.4s \n" \ | |||
| " ldp q22, q23, ["X", #32] \n" \ | |||
| " ldp q30, q31, ["Y", #32] \n" \ | |||
| " ldp q22, q23, [%[X_], #32] \n" \ | |||
| " ldp q30, q31, [%[Y_], #32] \n" \ | |||
| " fcvtl v20.2d, v22.2s \n" \ | |||
| " fcvtl2 v21.2d, v22.4s \n" \ | |||
| " fcvtl v22.2d, v23.2s \n" \ | |||
| @@ -173,16 +169,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| " fcvtl2 v29.2d, v30.4s \n" \ | |||
| " fcvtl v30.2d, v31.2s \n" \ | |||
| " fcvtl2 v31.2d, v31.4s \n" \ | |||
| " PRFM PLDL1KEEP, ["X", #896] \n" \ | |||
| " PRFM PLDL1KEEP, ["Y", #896] \n" \ | |||
| " PRFM PLDL1KEEP, ["X", #896+64] \n" \ | |||
| " PRFM PLDL1KEEP, ["Y", #896+64] \n" \ | |||
| " PRFM PLDL1KEEP, [%[X_], #896] \n" \ | |||
| " PRFM PLDL1KEEP, [%[Y_], #896] \n" \ | |||
| " PRFM PLDL1KEEP, [%[X_], #896+64] \n" \ | |||
| " PRFM PLDL1KEEP, [%[Y_], #896+64] \n" \ | |||
| " fmla v0.2d, v16.2d, v24.2d \n" \ | |||
| " fmla v1.2d, v17.2d, v25.2d \n" \ | |||
| " fmla v2.2d, v18.2d, v26.2d \n" \ | |||
| " fmla v3.2d, v19.2d, v27.2d \n" \ | |||
| " add "Y", "Y", #64 \n" \ | |||
| " add "X", "X", #64 \n" \ | |||
| " add %[Y_], %[Y_], #64 \n" \ | |||
| " add %[X_], %[X_], #64 \n" \ | |||
| " fmla v4.2d, v20.2d, v28.2d \n" \ | |||
| " fmla v5.2d, v21.2d, v29.2d \n" \ | |||
| " fmla v6.2d, v22.2d, v30.2d \n" \ | |||
| @@ -196,60 +192,60 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| " fadd v0.2d, v0.2d, v2.2d \n" \ | |||
| " fadd v4.2d, v4.2d, v6.2d \n" \ | |||
| " fadd v0.2d, v0.2d, v4.2d \n" \ | |||
| " faddp "DOTF", v0.2d \n" | |||
| " faddp "OUT", v0.2d \n" | |||
| #endif /* !defined(DSDOT) */ | |||
| #else /* !defined(DOUBLE) */ | |||
| #define KERNEL_F1 \ | |||
| " ldr "TMPX", ["X"] \n" \ | |||
| " ldr "TMPY", ["Y"] \n" \ | |||
| " add "X", "X", "INC_X" \n" \ | |||
| " add "Y", "Y", "INC_Y" \n" \ | |||
| " fmadd "DOTF", "TMPX", "TMPY", "DOTF" \n" | |||
| " ldr "TMPX", [%[X_]] \n" \ | |||
| " ldr "TMPY", [%[Y_]] \n" \ | |||
| " add %[X_], %[X_], %[INCX_] \n" \ | |||
| " add %[Y_], %[Y_], %[INCY_] \n" \ | |||
| " fmadd "OUT", "TMPX", "TMPY", "OUT" \n" | |||
| #define KERNEL_F \ | |||
| " ldp q16, q17, ["X"] \n" \ | |||
| " ldp q24, q25, ["Y"] \n" \ | |||
| " ldp q18, q19, ["X", #32] \n" \ | |||
| " ldp q26, q27, ["Y", #32] \n" \ | |||
| " ldp q16, q17, [%[X_]] \n" \ | |||
| " ldp q24, q25, [%[Y_]] \n" \ | |||
| " ldp q18, q19, [%[X_], #32] \n" \ | |||
| " ldp q26, q27, [%[Y_], #32] \n" \ | |||
| " fmla v0.2d, v16.2d, v24.2d \n" \ | |||
| " fmla v1.2d, v17.2d, v25.2d \n" \ | |||
| " ldp q20, q21, ["X", #64] \n" \ | |||
| " ldp q28, q29, ["Y", #64] \n" \ | |||
| " ldp q20, q21, [%[X_], #64] \n" \ | |||
| " ldp q28, q29, [%[Y_], #64] \n" \ | |||
| " fmla v2.2d, v18.2d, v26.2d \n" \ | |||
| " fmla v3.2d, v19.2d, v27.2d \n" \ | |||
| " ldp q22, q23, ["X", #96] \n" \ | |||
| " ldp q30, q31, ["Y", #96] \n" \ | |||
| " add "Y", "Y", #128 \n" \ | |||
| " add "X", "X", #128 \n" \ | |||
| " ldp q22, q23, [%[X_], #96] \n" \ | |||
| " ldp q30, q31, [%[Y_], #96] \n" \ | |||
| " add %[Y_], %[Y_], #128 \n" \ | |||
| " add %[X_], %[X_], #128 \n" \ | |||
| " fmla v4.2d, v20.2d, v28.2d \n" \ | |||
| " fmla v5.2d, v21.2d, v29.2d \n" \ | |||
| " PRFM PLDL1KEEP, ["X", #896] \n" \ | |||
| " PRFM PLDL1KEEP, ["Y", #896] \n" \ | |||
| " PRFM PLDL1KEEP, ["X", #896+64] \n" \ | |||
| " PRFM PLDL1KEEP, ["Y", #896+64] \n" \ | |||
| " PRFM PLDL1KEEP, [%[X_], #896] \n" \ | |||
| " PRFM PLDL1KEEP, [%[Y_], #896] \n" \ | |||
| " PRFM PLDL1KEEP, [%[X_], #896+64] \n" \ | |||
| " PRFM PLDL1KEEP, [%[Y_], #896+64] \n" \ | |||
| " fmla v6.2d, v22.2d, v30.2d \n" \ | |||
| " fmla v7.2d, v23.2d, v31.2d \n" \ | |||
| " ldp q16, q17, ["X"] \n" \ | |||
| " ldp q24, q25, ["Y"] \n" \ | |||
| " ldp q18, q19, ["X", #32] \n" \ | |||
| " ldp q26, q27, ["Y", #32] \n" \ | |||
| " ldp q16, q17, [%[X_]] \n" \ | |||
| " ldp q24, q25, [%[Y_]] \n" \ | |||
| " ldp q18, q19, [%[X_], #32] \n" \ | |||
| " ldp q26, q27, [%[Y_], #32] \n" \ | |||
| " fmla v0.2d, v16.2d, v24.2d \n" \ | |||
| " fmla v1.2d, v17.2d, v25.2d \n" \ | |||
| " ldp q20, q21, ["X", #64] \n" \ | |||
| " ldp q28, q29, ["Y", #64] \n" \ | |||
| " ldp q20, q21, [%[X_], #64] \n" \ | |||
| " ldp q28, q29, [%[Y_], #64] \n" \ | |||
| " fmla v2.2d, v18.2d, v26.2d \n" \ | |||
| " fmla v3.2d, v19.2d, v27.2d \n" \ | |||
| " ldp q22, q23, ["X", #96] \n" \ | |||
| " ldp q30, q31, ["Y", #96] \n" \ | |||
| " add "Y", "Y", #128 \n" \ | |||
| " add "X", "X", #128 \n" \ | |||
| " ldp q22, q23, [%[X_], #96] \n" \ | |||
| " ldp q30, q31, [%[Y_], #96] \n" \ | |||
| " add %[Y_], %[Y_], #128 \n" \ | |||
| " add %[X_], %[X_], #128 \n" \ | |||
| " fmla v4.2d, v20.2d, v28.2d \n" \ | |||
| " fmla v5.2d, v21.2d, v29.2d \n" \ | |||
| " PRFM PLDL1KEEP, ["X", #896] \n" \ | |||
| " PRFM PLDL1KEEP, ["Y", #896] \n" \ | |||
| " PRFM PLDL1KEEP, ["X", #896+64] \n" \ | |||
| " PRFM PLDL1KEEP, ["Y", #896+64] \n" \ | |||
| " PRFM PLDL1KEEP, [%[X_], #896] \n" \ | |||
| " PRFM PLDL1KEEP, [%[Y_], #896] \n" \ | |||
| " PRFM PLDL1KEEP, [%[X_], #896+64] \n" \ | |||
| " PRFM PLDL1KEEP, [%[Y_], #896+64] \n" \ | |||
| " fmla v6.2d, v22.2d, v30.2d \n" \ | |||
| " fmla v7.2d, v23.2d, v31.2d \n" | |||
| @@ -261,28 +257,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| " fadd v0.2d, v0.2d, v2.2d \n" \ | |||
| " fadd v4.2d, v4.2d, v6.2d \n" \ | |||
| " fadd v0.2d, v0.2d, v4.2d \n" \ | |||
| " faddp "DOTF", v0.2d \n" | |||
| " faddp "OUT", v0.2d \n" | |||
| #endif /* !defined(DOUBLE) */ | |||
| #if defined(SMP) | |||
| extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n, | |||
| BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, | |||
| void *c, BLASLONG ldc, int (*function)(), int nthreads); | |||
| #endif | |||
| static RETURN_TYPE dot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| static RETURN_TYPE dot_kernel_asimd(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| { | |||
| RETURN_TYPE dot = 0.0 ; | |||
| if ( n < 0 ) return dot; | |||
| RETURN_TYPE dot = 0.0; | |||
| BLASLONG j = 0; | |||
| __asm__ __volatile__ ( | |||
| " mov "N", %[N_] \n" | |||
| " mov "X", %[X_] \n" | |||
| " mov "INC_X", %[INCX_] \n" | |||
| " mov "Y", %[Y_] \n" | |||
| " mov "INC_Y", %[INCY_] \n" | |||
| " fmov "DOTF", "REG0" \n" | |||
| " fmov "OUT", "REG0" \n" | |||
| " fmov d1, xzr \n" | |||
| " fmov d2, xzr \n" | |||
| " fmov d3, xzr \n" | |||
| @@ -290,42 +274,40 @@ static RETURN_TYPE dot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, B | |||
| " fmov d5, xzr \n" | |||
| " fmov d6, xzr \n" | |||
| " fmov d7, xzr \n" | |||
| " cmp "N", xzr \n" | |||
| " ble 9f //dot_kernel_L999 \n" | |||
| " cmp "INC_X", #1 \n" | |||
| " cmp %[INCX_], #1 \n" | |||
| " bne 5f //dot_kernel_S_BEGIN \n" | |||
| " cmp "INC_Y", #1 \n" | |||
| " cmp %[INCY_], #1 \n" | |||
| " bne 5f //dot_kernel_S_BEGIN \n" | |||
| "1: //dot_kernel_F_BEGIN: \n" | |||
| " lsl "INC_X", "INC_X", "INC_SHIFT" \n" | |||
| " lsl "INC_Y", "INC_Y", "INC_SHIFT" \n" | |||
| " asr "J", "N", #"N_DIV_SHIFT" \n" | |||
| " cmp "J", xzr \n" | |||
| " lsl %[INCX_], %[INCX_], "INC_SHIFT" \n" | |||
| " lsl %[INCY_], %[INCY_], "INC_SHIFT" \n" | |||
| " asr %[J_], %[N_], #"N_DIV_SHIFT" \n" | |||
| " cmp %[J_], xzr \n" | |||
| " beq 3f //dot_kernel_F1 \n" | |||
| " .align 5 \n" | |||
| "2: //dot_kernel_F: \n" | |||
| " "KERNEL_F" \n" | |||
| " subs "J", "J", #1 \n" | |||
| " subs %[J_], %[J_], #1 \n" | |||
| " bne 2b //dot_kernel_F \n" | |||
| " "KERNEL_F_FINALIZE" \n" | |||
| "3: //dot_kernel_F1: \n" | |||
| " ands "J", "N", #"N_REM_MASK" \n" | |||
| " ands %[J_], %[N_], #"N_REM_MASK" \n" | |||
| " ble 9f //dot_kernel_L999 \n" | |||
| "4: //dot_kernel_F10: \n" | |||
| " "KERNEL_F1" \n" | |||
| " subs "J", "J", #1 \n" | |||
| " subs %[J_], %[J_], #1 \n" | |||
| " bne 4b //dot_kernel_F10 \n" | |||
| " b 9f //dot_kernel_L999 \n" | |||
| "5: //dot_kernel_S_BEGIN: \n" | |||
| " lsl "INC_X", "INC_X", "INC_SHIFT" \n" | |||
| " lsl "INC_Y", "INC_Y", "INC_SHIFT" \n" | |||
| " asr "J", "N", #2 \n" | |||
| " cmp "J", xzr \n" | |||
| " lsl %[INCX_], %[INCX_], "INC_SHIFT" \n" | |||
| " lsl %[INCY_], %[INCY_], "INC_SHIFT" \n" | |||
| " asr %[J_], %[N_], #2 \n" | |||
| " cmp %[J_], xzr \n" | |||
| " ble 7f //dot_kernel_S1 \n" | |||
| "6: //dot_kernel_S4: \n" | |||
| @@ -333,88 +315,31 @@ static RETURN_TYPE dot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, B | |||
| " "KERNEL_F1" \n" | |||
| " "KERNEL_F1" \n" | |||
| " "KERNEL_F1" \n" | |||
| " subs "J", "J", #1 \n" | |||
| " subs %[J_], %[J_], #1 \n" | |||
| " bne 6b //dot_kernel_S4 \n" | |||
| "7: //dot_kernel_S1: \n" | |||
| " ands "J", "N", #3 \n" | |||
| " ands %[J_], %[N_], #3 \n" | |||
| " ble 9f //dot_kernel_L999 \n" | |||
| "8: //dot_kernel_S10: \n" | |||
| " "KERNEL_F1" \n" | |||
| " subs "J", "J", #1 \n" | |||
| " subs %[J_], %[J_], #1 \n" | |||
| " bne 8b //dot_kernel_S10 \n" | |||
| "9: //dot_kernel_L999: \n" | |||
| " str "DOTF", [%[DOT_]] \n" | |||
| : | |||
| : [DOT_] "r" (&dot), //%0 | |||
| [N_] "r" (n), //%1 | |||
| [X_] "r" (x), //%2 | |||
| [INCX_] "r" (inc_x), //%3 | |||
| [Y_] "r" (y), //%4 | |||
| [INCY_] "r" (inc_y) //%5 | |||
| : [DOT_] "=&w" (dot) | |||
| : [N_] "r" (n), | |||
| [X_] "r" (x), | |||
| [INCX_] "r" (inc_x), | |||
| [Y_] "r" (y), | |||
| [INCY_] "r" (inc_y), | |||
| [J_] "r" (j) | |||
| : "cc", | |||
| "memory", | |||
| "x0", "x1", "x2", "x3", "x4", "x5", | |||
| "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" | |||
| "d1", "d2", "d3", "d4", "d5", "d6", "d7" | |||
| ); | |||
| return dot; | |||
| } | |||
| #if defined(SMP) | |||
| static int dot_thread_function(BLASLONG n, BLASLONG dummy0, | |||
| BLASLONG dummy1, FLOAT dummy2, FLOAT *x, BLASLONG inc_x, FLOAT *y, | |||
| BLASLONG inc_y, FLOAT *result, BLASLONG dummy3) | |||
| { | |||
| *(RETURN_TYPE *)result = dot_compute(n, x, inc_x, y, inc_y); | |||
| return 0; | |||
| } | |||
| #endif | |||
| RETURN_TYPE CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| { | |||
| #if defined(SMP) | |||
| int nthreads; | |||
| FLOAT dummy_alpha; | |||
| #endif | |||
| RETURN_TYPE dot = 0.0; | |||
| #if defined(SMP) | |||
| if (inc_x == 0 || inc_y == 0 || n <= 10000) | |||
| nthreads = 1; | |||
| else | |||
| nthreads = num_cpu_avail(1); | |||
| if (nthreads == 1) { | |||
| dot = dot_compute(n, x, inc_x, y, inc_y); | |||
| } else { | |||
| int mode, i; | |||
| char result[MAX_CPU_NUMBER * sizeof(double) * 2]; | |||
| RETURN_TYPE *ptr; | |||
| #if !defined(DOUBLE) | |||
| mode = BLAS_SINGLE | BLAS_REAL; | |||
| #else | |||
| mode = BLAS_DOUBLE | BLAS_REAL; | |||
| #endif | |||
| blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, | |||
| x, inc_x, y, inc_y, result, 0, | |||
| ( void *)dot_thread_function, nthreads); | |||
| ptr = (RETURN_TYPE *)result; | |||
| for (i = 0; i < nthreads; i++) { | |||
| dot = dot + (*ptr); | |||
| ptr = (RETURN_TYPE *)(((char *)ptr) + sizeof(double) * 2); | |||
| } | |||
| } | |||
| #else | |||
| dot = dot_compute(n, x, inc_x, y, inc_y); | |||
| #endif | |||
| return dot; | |||
| } | |||
| @@ -0,0 +1,66 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2022, Arm Ltd | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |||
| CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE | |||
| GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | |||
| HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | |||
| LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF | |||
| THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #include <arm_sve.h> | |||
| #ifdef DOUBLE | |||
| #define SVE_TYPE svfloat64_t | |||
| #define SVE_ZERO svdup_f64(0.0) | |||
| #define SVE_WHILELT svwhilelt_b64 | |||
| #define SVE_ALL svptrue_b64() | |||
| #define SVE_WIDTH svcntd() | |||
| #else | |||
| #define SVE_TYPE svfloat32_t | |||
| #define SVE_ZERO svdup_f32(0.0) | |||
| #define SVE_WHILELT svwhilelt_b32 | |||
| #define SVE_ALL svptrue_b32() | |||
| #define SVE_WIDTH svcntw() | |||
| #endif | |||
| static FLOAT dot_kernel_sve(BLASLONG n, FLOAT *x, FLOAT *y) { | |||
| SVE_TYPE acc_a = SVE_ZERO; | |||
| SVE_TYPE acc_b = SVE_ZERO; | |||
| BLASLONG sve_width = SVE_WIDTH; | |||
| for (BLASLONG i = 0; i < n; i += sve_width * 2) { | |||
| svbool_t pg_a = SVE_WHILELT(i, n); | |||
| svbool_t pg_b = SVE_WHILELT(i + sve_width, n); | |||
| SVE_TYPE x_vec_a = svld1(pg_a, &x[i]); | |||
| SVE_TYPE y_vec_a = svld1(pg_a, &y[i]); | |||
| SVE_TYPE x_vec_b = svld1(pg_b, &x[i + sve_width]); | |||
| SVE_TYPE y_vec_b = svld1(pg_b, &y[i + sve_width]); | |||
| acc_a = svmla_m(pg_a, acc_a, x_vec_a, y_vec_a); | |||
| acc_b = svmla_m(pg_b, acc_b, x_vec_b, y_vec_b); | |||
| } | |||
| return svaddv(SVE_ALL, acc_a) + svaddv(SVE_ALL, acc_b); | |||
| } | |||
| @@ -37,9 +37,9 @@ | |||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT *A, IFLOAT *B, | |||
| FLOAT *C, BLASLONG ldc) { | |||
| if (alpha == 1.0f) | |||
| return sbgemm_kernel_neoversen2_alpha_one(m, n, k, alpha, A, B, C, ldc); | |||
| else | |||
| return sbgemm_kernel_neoversen2_alpha(m, n, k, alpha, A, B, C, ldc); | |||
| return 0; | |||
| if (alpha == 1.0f) | |||
| return sbgemm_kernel_neoversen2_alpha_one(m, n, k, alpha, A, B, C, ldc); | |||
| else | |||
| return sbgemm_kernel_neoversen2_alpha(m, n, k, alpha, A, B, C, ldc); | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,126 @@ | |||
| /*************************************************************************** | |||
| * Copyright (c) 2022, The OpenBLAS Project | |||
| * All rights reserved. | |||
| * Redistribution and use in source and binary forms, with or without | |||
| * modification, are permitted provided that the following conditions are | |||
| * met: | |||
| * 1. Redistributions of source code must retain the above copyright | |||
| * notice, this list of conditions and the following disclaimer. | |||
| * 2. Redistributions in binary form must reproduce the above copyright | |||
| * notice, this list of conditions and the following disclaimer in | |||
| * the documentation and/or other materials provided with the | |||
| * distribution. | |||
| * 3. Neither the name of the OpenBLAS project nor the names of | |||
| * its contributors may be used to endorse or promote products | |||
| * derived from this software without specific prior written permission. | |||
| * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| * ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |||
| * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | |||
| * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | |||
| * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | |||
| * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | |||
| * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | |||
| * POSSIBILITY OF SUCH DAMAGE. | |||
| * *****************************************************************************/ | |||
| #include <arm_sve.h> | |||
| #include "common.h" | |||
| int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) { | |||
| IFLOAT *a_offset; | |||
| IFLOAT *a_offsetx[4]; | |||
| IFLOAT *b_offset; | |||
| a_offset = a; | |||
| b_offset = b; | |||
| svbool_t pg16 = svdupq_b16(1, 1, 1, 1, 0, 0, 0, 0); | |||
| svbfloat16_t v0, v1, v2, v3; | |||
| for (BLASLONG j = 0; j < n / 4; j++) { | |||
| a_offsetx[0] = a_offset; | |||
| a_offsetx[1] = a_offsetx[0] + lda; | |||
| a_offsetx[2] = a_offsetx[1] + lda; | |||
| a_offsetx[3] = a_offsetx[2] + lda; | |||
| a_offset += 4 * lda; | |||
| for (BLASLONG i = 0; i < m / 4; i++) { | |||
| v0 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[0]); | |||
| v1 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[1]); | |||
| v2 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[2]); | |||
| v3 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[3]); | |||
| svst1_bf16(pg16, (bfloat16_t *)b_offset, v0); | |||
| svst1_bf16(pg16, (bfloat16_t *)b_offset + 4, v1); | |||
| svst1_bf16(pg16, (bfloat16_t *)b_offset + 8, v2); | |||
| svst1_bf16(pg16, (bfloat16_t *)b_offset + 12, v3); | |||
| b_offset += 16; | |||
| a_offsetx[0] += 4; | |||
| a_offsetx[1] += 4; | |||
| a_offsetx[2] += 4; | |||
| a_offsetx[3] += 4; | |||
| } | |||
| if (m & 3) { | |||
| BLASLONG rest = m & 3; | |||
| for (BLASLONG col = 0; col < 4; col++) { | |||
| b_offset[4 * col] = a_offsetx[col][0]; | |||
| b_offset[4 * col + 1] = rest == 1 ? 0 : a_offsetx[col][1]; | |||
| b_offset[4 * col + 2] = rest <= 2 ? 0 : a_offsetx[col][2]; | |||
| b_offset[4 * col + 3] = rest <= 3 ? 0 : a_offsetx[col][3]; | |||
| } | |||
| b_offset += 16; | |||
| } | |||
| } | |||
| if (n & 2) { | |||
| a_offsetx[0] = a_offset; | |||
| a_offsetx[1] = a_offsetx[0] + lda; | |||
| a_offset += 2 * lda; | |||
| for (BLASLONG i = 0; i < m / 4; i++) { | |||
| v0 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[0]); | |||
| v1 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[1]); | |||
| svst1_bf16(pg16, (bfloat16_t *)b_offset, v0); | |||
| svst1_bf16(pg16, (bfloat16_t *)b_offset + 4, v1); | |||
| b_offset += 8; | |||
| a_offsetx[0] += 4; | |||
| a_offsetx[1] += 4; | |||
| } | |||
| if (m & 3) { | |||
| BLASLONG rest = m & 3; | |||
| for (BLASLONG col = 0; col < 2; col++) { | |||
| b_offset[4 * col] = a_offsetx[col][0]; | |||
| b_offset[4 * col + 1] = rest == 1 ? 0 : a_offsetx[col][1]; | |||
| b_offset[4 * col + 2] = rest <= 2 ? 0 : a_offsetx[col][2]; | |||
| b_offset[4 * col + 3] = rest <= 3 ? 0 : a_offsetx[col][3]; | |||
| } | |||
| b_offset += 8; | |||
| } | |||
| } | |||
| if (n & 1) { | |||
| a_offsetx[0] = a_offset; | |||
| for (BLASLONG i = 0; i < m / 4; i++) { | |||
| v0 = svld1_bf16(pg16, (bfloat16_t *)a_offsetx[0]); | |||
| svst1_bf16(pg16, (bfloat16_t *)b_offset, v0); | |||
| b_offset += 4; | |||
| a_offsetx[0] += 4; | |||
| } | |||
| if (m & 3) { | |||
| BLASLONG rest = m & 3; | |||
| b_offset[0] = a_offsetx[0][0]; | |||
| b_offset[1] = rest == 1 ? 0 : a_offsetx[0][1]; | |||
| b_offset[2] = rest <= 2 ? 0 : a_offsetx[0][2]; | |||
| b_offset[3] = rest <= 3 ? 0 : a_offsetx[0][3]; | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -1,101 +0,0 @@ | |||
| /*************************************************************************** | |||
| * Copyright (c) 2022, The OpenBLAS Project | |||
| * All rights reserved. | |||
| * Redistribution and use in source and binary forms, with or without | |||
| * modification, are permitted provided that the following conditions are | |||
| * met: | |||
| * 1. Redistributions of source code must retain the above copyright | |||
| * notice, this list of conditions and the following disclaimer. | |||
| * 2. Redistributions in binary form must reproduce the above copyright | |||
| * notice, this list of conditions and the following disclaimer in | |||
| * the documentation and/or other materials provided with the | |||
| * distribution. | |||
| * 3. Neither the name of the OpenBLAS project nor the names of | |||
| * its contributors may be used to endorse or promote products | |||
| * derived from this software without specific prior written permission. | |||
| * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| * ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |||
| * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | |||
| * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | |||
| * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | |||
| * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | |||
| * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | |||
| * POSSIBILITY OF SUCH DAMAGE. | |||
| * *****************************************************************************/ | |||
| #include "common.h" | |||
| /* | |||
|  * Packs n vectors of m contiguous elements each (consecutive vectors are | |||
|  * lda elements apart in a) into the dense buffer b. | |||
|  * | |||
|  * Vectors are consumed two at a time. For every run of four elements the | |||
|  * output holds the four elements of the first vector followed by the four | |||
|  * elements of the second one; the last (m & 3) elements are emitted the same | |||
|  * way with the shorter run length. When n is odd the final vector is copied | |||
|  * through unchanged. Always returns 0. | |||
|  */ | |||
| int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) { | |||
|   IFLOAT *src = a; | |||
|   IFLOAT *dst = b; | |||
|   const BLASLONG quads = m / 4; /* number of full 4-element runs */ | |||
|   const BLASLONG tail = m & 3;  /* elements left after the full runs */ | |||
|   BLASLONG k; | |||
|   for (BLASLONG pair = n / 2; pair > 0; pair--) { | |||
|     IFLOAT *lo = src;       /* first vector of the pair */ | |||
|     IFLOAT *hi = src + lda; /* second vector of the pair */ | |||
|     src += 2 * lda; | |||
|     for (BLASLONG q = 0; q < quads; q++) { | |||
|       for (k = 0; k < 4; k++) dst[k] = lo[k]; | |||
|       for (k = 0; k < 4; k++) dst[4 + k] = hi[k]; | |||
|       lo += 4; | |||
|       hi += 4; | |||
|       dst += 8; | |||
|     } | |||
|     /* Short run: tail elements of lo, then tail elements of hi. */ | |||
|     for (k = 0; k < tail; k++) dst[k] = lo[k]; | |||
|     for (k = 0; k < tail; k++) dst[tail + k] = hi[k]; | |||
|     dst += 2 * tail; | |||
|   } | |||
|   if (n & 1) { | |||
|     /* Odd vector left over: straight copy, four elements at a time. */ | |||
|     for (BLASLONG q = 0; q < quads; q++) { | |||
|       for (k = 0; k < 4; k++) dst[k] = src[k]; | |||
|       dst += 4; | |||
|       src += 4; | |||
|     } | |||
|     for (k = 0; k < tail; k++) dst[k] = src[k]; | |||
|   } | |||
|   return 0; | |||
| } | |||
| @@ -0,0 +1,165 @@ | |||
| /*************************************************************************** | |||
| * Copyright (c) 2022, The OpenBLAS Project | |||
| * All rights reserved. | |||
| * Redistribution and use in source and binary forms, with or without | |||
| * modification, are permitted provided that the following conditions are | |||
| * met: | |||
| * 1. Redistributions of source code must retain the above copyright | |||
| * notice, this list of conditions and the following disclaimer. | |||
| * 2. Redistributions in binary form must reproduce the above copyright | |||
| * notice, this list of conditions and the following disclaimer in | |||
| * the documentation and/or other materials provided with the | |||
| * distribution. | |||
| * 3. Neither the name of the OpenBLAS project nor the names of | |||
| * its contributors may be used to endorse or promote products | |||
| * derived from this software without specific prior written permission. | |||
| * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| * ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |||
| * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | |||
| * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | |||
| * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | |||
| * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | |||
| * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | |||
| * POSSIBILITY OF SUCH DAMAGE. | |||
| * *****************************************************************************/ | |||
| #include "common.h" | |||
| /* | |||
|  * Packs one panel that is `width` elements wide. | |||
|  * | |||
|  * The m source vectors (lda elements apart, starting at panel) are taken in | |||
|  * groups of four. For every group the output receives `width` quadruples, | |||
|  * quadruple c holding element c of each of the four vectors. If m is not a | |||
|  * multiple of four, one more group is written in which the slots of the | |||
|  * missing vectors are filled with 0 (those vectors are never read). | |||
|  * | |||
|  * Returns the write position just past the packed panel. | |||
|  */ | |||
| static inline IFLOAT *pack_k4_panel(BLASLONG m, BLASLONG width, IFLOAT *panel, BLASLONG lda, IFLOAT *dst) { | |||
|   IFLOAT *k0 = panel; | |||
|   IFLOAT *k1 = k0 + lda; | |||
|   IFLOAT *k2 = k1 + lda; | |||
|   IFLOAT *k3 = k2 + lda; | |||
|   const BLASLONG step = 4 * lda; | |||
|   const BLASLONG tail = m & 3; | |||
|   BLASLONG c; | |||
|   for (BLASLONG group = m / 4; group > 0; group--) { | |||
|     for (c = 0; c < width; c++) { | |||
|       dst[4 * c] = k0[c]; | |||
|       dst[4 * c + 1] = k1[c]; | |||
|       dst[4 * c + 2] = k2[c]; | |||
|       dst[4 * c + 3] = k3[c]; | |||
|     } | |||
|     dst += 4 * width; | |||
|     k0 += step; | |||
|     k1 += step; | |||
|     k2 += step; | |||
|     k3 += step; | |||
|   } | |||
|   if (tail) { | |||
|     for (c = 0; c < width; c++) { | |||
|       dst[4 * c] = k0[c]; | |||
|       dst[4 * c + 1] = (tail == 1) ? 0 : k1[c]; | |||
|       dst[4 * c + 2] = (tail <= 2) ? 0 : k2[c]; | |||
|       /* tail is at most 3, so the fourth slot is always padding. */ | |||
|       dst[4 * c + 3] = 0; | |||
|     } | |||
|     dst += 4 * width; | |||
|   } | |||
|   return dst; | |||
| } | |||
| /* | |||
|  * Packs the source a (m vectors of n elements, vectors lda elements apart) | |||
|  * into b as a sequence of panels: as many 8-wide panels as fit in n, then at | |||
|  * most one 4-wide, one 2-wide and one 1-wide panel for the remainder. | |||
|  * Always returns 0. | |||
|  */ | |||
| int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) { | |||
|   IFLOAT *src = a; | |||
|   IFLOAT *dst = b; | |||
|   for (BLASLONG panel = n / 8; panel > 0; panel--) { | |||
|     dst = pack_k4_panel(m, 8, src, lda, dst); | |||
|     src += 8; | |||
|   } | |||
|   if (n & 4) { | |||
|     dst = pack_k4_panel(m, 4, src, lda, dst); | |||
|     src += 4; | |||
|   } | |||
|   if (n & 2) { | |||
|     dst = pack_k4_panel(m, 2, src, lda, dst); | |||
|     src += 2; | |||
|   } | |||
|   if (n & 1) { | |||
|     (void)pack_k4_panel(m, 1, src, lda, dst); | |||
|   } | |||
|   return 0; | |||
| } | |||
| @@ -1,109 +0,0 @@ | |||
| /*************************************************************************** | |||
| * Copyright (c) 2022, The OpenBLAS Project | |||
| * All rights reserved. | |||
| * Redistribution and use in source and binary forms, with or without | |||
| * modification, are permitted provided that the following conditions are | |||
| * met: | |||
| * 1. Redistributions of source code must retain the above copyright | |||
| * notice, this list of conditions and the following disclaimer. | |||
| * 2. Redistributions in binary form must reproduce the above copyright | |||
| * notice, this list of conditions and the following disclaimer in | |||
| * the documentation and/or other materials provided with the | |||
| * distribution. | |||
| * 3. Neither the name of the OpenBLAS project nor the names of | |||
| * its contributors may be used to endorse or promote products | |||
| * derived from this software without specific prior written permission. | |||
| * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| * ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |||
| * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF | |||
| * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS | |||
| * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN | |||
| * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) | |||
| * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | |||
| * POSSIBILITY OF SUCH DAMAGE. | |||
| * *****************************************************************************/ | |||
| #include "common.h" | |||
| /* | |||
|  * Packs the source a (m vectors of n elements, vectors lda elements apart) | |||
|  * into the dense buffer b. | |||
|  * | |||
|  * Elements are consumed two positions at a time along n. For each group of | |||
|  * four vectors the output holds position 0 of the four vectors followed by | |||
|  * position 1 of the four vectors. The last (m & 3) vectors are emitted the | |||
|  * same way with the shorter group size and no padding. When n is odd, the | |||
|  * final position of every vector is gathered one after another. | |||
|  * Always returns 0. | |||
|  */ | |||
| int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) { | |||
|   IFLOAT *src = a; | |||
|   IFLOAT *dst = b; | |||
|   const BLASLONG quads = m / 4; /* number of full 4-vector groups */ | |||
|   const BLASLONG tail = m & 3;  /* vectors left after the full groups */ | |||
|   BLASLONG c, r; | |||
|   for (BLASLONG pair = n / 2; pair > 0; pair--) { | |||
|     IFLOAT *cur = src; /* first vector of the current group */ | |||
|     src += 2; | |||
|     for (BLASLONG q = 0; q < quads; q++) { | |||
|       for (c = 0; c < 2; c++) { | |||
|         for (r = 0; r < 4; r++) { | |||
|           dst[4 * c + r] = cur[r * lda + c]; | |||
|         } | |||
|       } | |||
|       dst += 8; | |||
|       cur += 4 * lda; | |||
|     } | |||
|     /* Short group: same layout with only tail vectors per position. */ | |||
|     for (c = 0; c < 2; c++) { | |||
|       for (r = 0; r < tail; r++) { | |||
|         dst[tail * c + r] = cur[r * lda + c]; | |||
|       } | |||
|     } | |||
|     dst += 2 * tail; | |||
|   } | |||
|   if (n & 1) { | |||
|     /* Single position left: gather it from every vector. */ | |||
|     for (BLASLONG q = 0; q < quads; q++) { | |||
|       for (r = 0; r < 4; r++) dst[r] = src[r * lda]; | |||
|       dst += 4; | |||
|       src += 4 * lda; | |||
|     } | |||
|     for (r = 0; r < tail; r++) dst[r] = src[r * lda]; | |||
|   } | |||
|   return 0; | |||
| } | |||
| @@ -1,333 +1,333 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| // Incoming values live in x0-x4 (presumably the m, n, a, lda, b arguments of the | |||
| // C-level prototype -- confirm against the caller). | |||
| #define M x0 | |||
| #define N x1 | |||
| #define A00 x2 | |||
| #define LDA x3 | |||
| #define B00 x4 | |||
| // Per-vector read pointers, derived from A00 at the start of each group of vectors. | |||
| #define A01 x5 | |||
| #define A02 x6 | |||
| #define A03 x7 | |||
| #define A04 x8 | |||
| // Loop counters: I counts along M, J counts groups along N. | |||
| #define I x9 | |||
| #define J x10 | |||
| // Scratch aliases; the code below never references them. | |||
| #define TEMP1 x11 | |||
| #define TEMP2 x12 | |||
| // Byte offset past each read pointer at which the copy macros issue their prefetch. | |||
| #define A_PREFETCH 2560 | |||
| /************************************************************************************** | |||
| * Macro definitions | |||
| **************************************************************************************/ | |||
| // SAVE_REGS: reserve 11 * 16 bytes of stack and spill d8-d17 and x18-x28 into it. | |||
| // Of the spilled registers, the copy macros in this file only write v8-v11. | |||
| .macro SAVE_REGS | |||
| add sp, sp, #-(11 * 16) | |||
| stp d8, d9, [sp, #(0 * 16)] | |||
| stp d10, d11, [sp, #(1 * 16)] | |||
| stp d12, d13, [sp, #(2 * 16)] | |||
| stp d14, d15, [sp, #(3 * 16)] | |||
| stp d16, d17, [sp, #(4 * 16)] | |||
| stp x18, x19, [sp, #(5 * 16)] | |||
| stp x20, x21, [sp, #(6 * 16)] | |||
| stp x22, x23, [sp, #(7 * 16)] | |||
| stp x24, x25, [sp, #(8 * 16)] | |||
| stp x26, x27, [sp, #(9 * 16)] | |||
| str x28, [sp, #(10 * 16)] | |||
| .endm | |||
| // RESTORE_REGS: reload d8-d17 and x18-x28 from the 11 * 16 byte stack area and release it. | |||
| // The slot layout mirrors SAVE_REGS. | |||
| .macro RESTORE_REGS | |||
| ldp d8, d9, [sp, #(0 * 16)] | |||
| ldp d10, d11, [sp, #(1 * 16)] | |||
| ldp d12, d13, [sp, #(2 * 16)] | |||
| ldp d14, d15, [sp, #(3 * 16)] | |||
| ldp d16, d17, [sp, #(4 * 16)] | |||
| ldp x18, x19, [sp, #(5 * 16)] | |||
| ldp x20, x21, [sp, #(6 * 16)] | |||
| ldp x22, x23, [sp, #(7 * 16)] | |||
| ldp x24, x25, [sp, #(8 * 16)] | |||
| ldp x26, x27, [sp, #(9 * 16)] | |||
| ldr x28, [sp, #(10 * 16)] | |||
| add sp, sp, #(11*16) | |||
| .endm | |||
| // COPY4x4: read four 32-bit elements from each of A01..A04 (each pointer advances 16 bytes) | |||
| // and write them to B00 transposed: v8 collects element 0 of the four vectors, v9 element 1, | |||
| // v10 element 2, v11 element 3. Stores 64 bytes and advances B00 by 64. | |||
| .macro COPY4x4 | |||
| // Prefetch hint A_PREFETCH bytes ahead on each of the four read streams. | |||
| prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||
| prfm PLDL1KEEP, [A03, #A_PREFETCH] | |||
| prfm PLDL1KEEP, [A04, #A_PREFETCH] | |||
| // Vector A01 fills lane 0 of v8..v11. | |||
| ldr q0, [A01], #16 | |||
| ins v8.s[0], v0.s[0] | |||
| ins v9.s[0], v0.s[1] | |||
| ins v10.s[0], v0.s[2] | |||
| ins v11.s[0], v0.s[3] | |||
| // Vector A02 fills lane 1. | |||
| ldr q1, [A02], #16 | |||
| ins v8.s[1], v1.s[0] | |||
| ins v9.s[1], v1.s[1] | |||
| ins v10.s[1], v1.s[2] | |||
| ins v11.s[1], v1.s[3] | |||
| // Vector A03 fills lane 2. | |||
| ldr q2, [A03], #16 | |||
| ins v8.s[2], v2.s[0] | |||
| ins v9.s[2], v2.s[1] | |||
| ins v10.s[2], v2.s[2] | |||
| ins v11.s[2], v2.s[3] | |||
| // Vector A04 fills lane 3. | |||
| ldr q3, [A04], #16 | |||
| ins v8.s[3], v3.s[0] | |||
| ins v9.s[3], v3.s[1] | |||
| ins v10.s[3], v3.s[2] | |||
| ins v11.s[3], v3.s[3] | |||
| st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [B00] | |||
| add B00, B00, #64 | |||
| .endm | |||
| // COPY1x4: read one 32-bit element from each of A01..A04 (each pointer advances 4 bytes) | |||
| // and write the four values consecutively to B00, advancing B00 by 16 bytes in total. | |||
| .macro COPY1x4 | |||
| prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||
| prfm PLDL1KEEP, [A03, #A_PREFETCH] | |||
| prfm PLDL1KEEP, [A04, #A_PREFETCH] | |||
| ldr s0, [A01], #4 | |||
| ldr s1, [A02], #4 | |||
| ldr s2, [A03], #4 | |||
| ldr s3, [A04], #4 | |||
| stp s0, s1, [B00] | |||
| add B00, B00, #8 | |||
| stp s2, s3, [B00] | |||
| add B00, B00, #8 | |||
| .endm | |||
| // COPY4x2: read four 32-bit elements from each of A01 and A02 (each pointer advances 16 bytes) | |||
| // and write them interleaved to B00: A01[0], A02[0], A01[1], A02[1], ... | |||
| // Only lanes 0 and 1 of v8..v11 are filled and stored (.2s), i.e. 32 bytes; B00 advances by 32. | |||
| .macro COPY4x2 | |||
| prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||
| // Vector A01 fills lane 0 of v8..v11. | |||
| ldr q0, [A01], #16 | |||
| ins v8.s[0], v0.s[0] | |||
| ins v9.s[0], v0.s[1] | |||
| ins v10.s[0], v0.s[2] | |||
| ins v11.s[0], v0.s[3] | |||
| // Vector A02 fills lane 1. | |||
| ldr q1, [A02], #16 | |||
| ins v8.s[1], v1.s[0] | |||
| ins v9.s[1], v1.s[1] | |||
| ins v10.s[1], v1.s[2] | |||
| ins v11.s[1], v1.s[3] | |||
| st1 {v8.2s, v9.2s, v10.2s, v11.2s}, [B00] | |||
| add B00, B00, #32 | |||
| .endm | |||
| // COPY1x2: read one 32-bit element from each of A01 and A02 (each pointer advances 4 bytes) | |||
| // and write the pair to B00, advancing B00 by 8 bytes. | |||
| .macro COPY1x2 | |||
| prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||
| ldr s0, [A01], #4 | |||
| ldr s1, [A02], #4 | |||
| stp s0, s1, [B00] | |||
| add B00, B00, #8 | |||
| .endm | |||
| // COPY4x1: copy 16 bytes (four 32-bit elements) from A01 to B00; both pointers advance by 16. | |||
| .macro COPY4x1 | |||
| prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| ldr q0, [A01], #16 | |||
| str q0, [B00], #16 | |||
| .endm | |||
| // COPY1x1: copy one 32-bit element from A01 to B00; both pointers advance by 4. | |||
| .macro COPY1x1 | |||
| prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| ldr s0, [A01], #4 | |||
| str s0, [B00], #4 | |||
| .endm | |||
| /************************************************************************************** | |||
| * End of macro definitions | |||
| **************************************************************************************/ | |||
| // Entry point: packs N vectors of M elements each (vectors LDA elements apart, starting at A00) | |||
| // into the buffer at B00. Vectors are taken four at a time, then two, then one; within a group | |||
| // the M direction is consumed in blocks of four elements followed by single elements. | |||
| // Elements are 4 bytes wide (LDA is scaled by 4 and the copy macros use s registers / .s lanes) | |||
| // even though the local labels carry a dgemm prefix. Returns 0 in x0. | |||
| PROLOGUE | |||
| .align 5 | |||
| SAVE_REGS | |||
| lsl LDA, LDA, #2 // LDA = LDA * SIZE | |||
| // ---- groups of four vectors ---- | |||
| .Ldgemm_ncopy_L4_BEGIN: | |||
| asr J, N, #2 // J = N / 4 | |||
| cmp J, #0 | |||
| ble .Ldgemm_ncopy_L2_BEGIN | |||
| .align 5 | |||
| .Ldgemm_ncopy_L4_M4_BEGIN: | |||
| // A01..A04 address the four vectors of this group; A00 moves on to the next group. | |||
| mov A01, A00 | |||
| add A02, A01, LDA | |||
| add A03, A02, LDA | |||
| add A04, A03, LDA | |||
| add A00, A04, LDA | |||
| asr I, M, #2 // I = M / 4 | |||
| cmp I, #0 | |||
| ble .Ldgemm_ncopy_L4_M4_40 | |||
| .align 5 | |||
| .Ldgemm_ncopy_L4_M4_20: | |||
| COPY4x4 | |||
| subs I , I , #1 | |||
| bne .Ldgemm_ncopy_L4_M4_20 | |||
| .Ldgemm_ncopy_L4_M4_40: | |||
| // Remaining M & 3 elements are copied one element per vector at a time. | |||
| and I, M , #3 | |||
| cmp I, #0 | |||
| ble .Ldgemm_ncopy_L4_M4_END | |||
| .align 5 | |||
| .Ldgemm_ncopy_L4_M4_60: | |||
| COPY1x4 | |||
| subs I , I , #1 | |||
| bne .Ldgemm_ncopy_L4_M4_60 | |||
| .Ldgemm_ncopy_L4_M4_END: | |||
| subs J , J, #1 // j-- | |||
| bne .Ldgemm_ncopy_L4_M4_BEGIN | |||
| /*********************************************************************************************/ | |||
| // ---- group of two vectors (taken when bit 1 of N is set) ---- | |||
| .Ldgemm_ncopy_L2_BEGIN: | |||
| // tst leaves Z set when the masked bits are all zero and these masks never set N, | |||
| // so each ble below branches exactly when the tested bits of N are clear. | |||
| tst N, #3 | |||
| ble .Ldgemm_ncopy_L999 | |||
| tst N, #2 | |||
| ble .Ldgemm_ncopy_L1_BEGIN | |||
| .Ldgemm_ncopy_L2_M4_BEGIN: | |||
| mov A01, A00 | |||
| add A02, A01, LDA | |||
| add A00, A02, LDA | |||
| asr I, M, #2 // I = M / 4 | |||
| cmp I, #0 | |||
| ble .Ldgemm_ncopy_L2_M4_40 | |||
| .align 5 | |||
| .Ldgemm_ncopy_L2_M4_20: | |||
| COPY4x2 | |||
| subs I , I , #1 | |||
| bne .Ldgemm_ncopy_L2_M4_20 | |||
| .Ldgemm_ncopy_L2_M4_40: | |||
| and I, M , #3 | |||
| cmp I, #0 | |||
| ble .Ldgemm_ncopy_L2_M4_END | |||
| .align 5 | |||
| .Ldgemm_ncopy_L2_M4_60: | |||
| COPY1x2 | |||
| subs I , I , #1 | |||
| bne .Ldgemm_ncopy_L2_M4_60 | |||
| .Ldgemm_ncopy_L2_M4_END: | |||
| /*********************************************************************************************/ | |||
| // ---- single trailing vector (taken when bit 0 of N is set) ---- | |||
| .Ldgemm_ncopy_L1_BEGIN: | |||
| tst N, #1 | |||
| ble .Ldgemm_ncopy_L999 | |||
| .Ldgemm_ncopy_L1_M4_BEGIN: | |||
| mov A01, A00 | |||
| asr I, M, #2 // I = M / 4 | |||
| cmp I, #0 | |||
| ble .Ldgemm_ncopy_L1_M4_40 | |||
| .align 5 | |||
| .Ldgemm_ncopy_L1_M4_20: | |||
| COPY4x1 | |||
| subs I , I , #1 | |||
| bne .Ldgemm_ncopy_L1_M4_20 | |||
| .Ldgemm_ncopy_L1_M4_40: | |||
| and I, M , #3 | |||
| cmp I, #0 | |||
| ble .Ldgemm_ncopy_L1_M4_END | |||
| .align 5 | |||
| .Ldgemm_ncopy_L1_M4_60: | |||
| COPY1x1 | |||
| subs I , I , #1 | |||
| bne .Ldgemm_ncopy_L1_M4_60 | |||
| .Ldgemm_ncopy_L1_M4_END: | |||
| // Common exit: return value 0, restore the spilled registers, return. | |||
| .Ldgemm_ncopy_L999: | |||
| mov x0, #0 | |||
| RESTORE_REGS | |||
| ret | |||
| EPILOGUE | |||
| /*************************************************************************** | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| // Incoming values live in x0-x4 (presumably the m, n, a, lda, b arguments of the | |||
| // C-level prototype -- confirm against the caller). | |||
| #define M x0 | |||
| #define N x1 | |||
| #define A00 x2 | |||
| #define LDA x3 | |||
| #define B00 x4 | |||
| // Per-vector read pointers, derived from A00 at the start of each group of vectors. | |||
| #define A01 x5 | |||
| #define A02 x6 | |||
| #define A03 x7 | |||
| #define A04 x8 | |||
| // Loop counters: I counts along M, J counts groups along N. | |||
| #define I x9 | |||
| #define J x10 | |||
| // Scratch aliases; the code below never references them. | |||
| #define TEMP1 x11 | |||
| #define TEMP2 x12 | |||
| // Byte offset past each read pointer at which the copy macros issue their prefetch. | |||
| #define A_PREFETCH 2560 | |||
| /************************************************************************************** | |||
| * Macro definitions | |||
| **************************************************************************************/ | |||
| // SAVE_REGS: reserve 11 * 16 bytes of stack and spill d8-d17 and x18-x28 into it. | |||
| // Of the spilled registers, the copy macros in this file only write v8-v11. | |||
| .macro SAVE_REGS | |||
| add sp, sp, #-(11 * 16) | |||
| stp d8, d9, [sp, #(0 * 16)] | |||
| stp d10, d11, [sp, #(1 * 16)] | |||
| stp d12, d13, [sp, #(2 * 16)] | |||
| stp d14, d15, [sp, #(3 * 16)] | |||
| stp d16, d17, [sp, #(4 * 16)] | |||
| stp x18, x19, [sp, #(5 * 16)] | |||
| stp x20, x21, [sp, #(6 * 16)] | |||
| stp x22, x23, [sp, #(7 * 16)] | |||
| stp x24, x25, [sp, #(8 * 16)] | |||
| stp x26, x27, [sp, #(9 * 16)] | |||
| str x28, [sp, #(10 * 16)] | |||
| .endm | |||
| // RESTORE_REGS: reload d8-d17 and x18-x28 from the 11 * 16 byte stack area and release it. | |||
| // The slot layout mirrors SAVE_REGS. | |||
| .macro RESTORE_REGS | |||
| ldp d8, d9, [sp, #(0 * 16)] | |||
| ldp d10, d11, [sp, #(1 * 16)] | |||
| ldp d12, d13, [sp, #(2 * 16)] | |||
| ldp d14, d15, [sp, #(3 * 16)] | |||
| ldp d16, d17, [sp, #(4 * 16)] | |||
| ldp x18, x19, [sp, #(5 * 16)] | |||
| ldp x20, x21, [sp, #(6 * 16)] | |||
| ldp x22, x23, [sp, #(7 * 16)] | |||
| ldp x24, x25, [sp, #(8 * 16)] | |||
| ldp x26, x27, [sp, #(9 * 16)] | |||
| ldr x28, [sp, #(10 * 16)] | |||
| add sp, sp, #(11*16) | |||
| .endm | |||
| // COPY4x4: read four 32-bit elements from each of A01..A04 (each pointer advances 16 bytes) | |||
| // and write them to B00 transposed: v8 collects element 0 of the four vectors, v9 element 1, | |||
| // v10 element 2, v11 element 3. Stores 64 bytes and advances B00 by 64. | |||
| .macro COPY4x4 | |||
| // Prefetch hint A_PREFETCH bytes ahead on each of the four read streams. | |||
| prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||
| prfm PLDL1KEEP, [A03, #A_PREFETCH] | |||
| prfm PLDL1KEEP, [A04, #A_PREFETCH] | |||
| // Vector A01 fills lane 0 of v8..v11. | |||
| ldr q0, [A01], #16 | |||
| ins v8.s[0], v0.s[0] | |||
| ins v9.s[0], v0.s[1] | |||
| ins v10.s[0], v0.s[2] | |||
| ins v11.s[0], v0.s[3] | |||
| // Vector A02 fills lane 1. | |||
| ldr q1, [A02], #16 | |||
| ins v8.s[1], v1.s[0] | |||
| ins v9.s[1], v1.s[1] | |||
| ins v10.s[1], v1.s[2] | |||
| ins v11.s[1], v1.s[3] | |||
| // Vector A03 fills lane 2. | |||
| ldr q2, [A03], #16 | |||
| ins v8.s[2], v2.s[0] | |||
| ins v9.s[2], v2.s[1] | |||
| ins v10.s[2], v2.s[2] | |||
| ins v11.s[2], v2.s[3] | |||
| // Vector A04 fills lane 3. | |||
| ldr q3, [A04], #16 | |||
| ins v8.s[3], v3.s[0] | |||
| ins v9.s[3], v3.s[1] | |||
| ins v10.s[3], v3.s[2] | |||
| ins v11.s[3], v3.s[3] | |||
| st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [B00] | |||
| add B00, B00, #64 | |||
| .endm | |||
| // COPY1x4: read one 32-bit element from each of A01..A04 (each pointer advances 4 bytes) | |||
| // and write the four values consecutively to B00, advancing B00 by 16 bytes in total. | |||
| .macro COPY1x4 | |||
| prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||
| prfm PLDL1KEEP, [A03, #A_PREFETCH] | |||
| prfm PLDL1KEEP, [A04, #A_PREFETCH] | |||
| ldr s0, [A01], #4 | |||
| ldr s1, [A02], #4 | |||
| ldr s2, [A03], #4 | |||
| ldr s3, [A04], #4 | |||
| stp s0, s1, [B00] | |||
| add B00, B00, #8 | |||
| stp s2, s3, [B00] | |||
| add B00, B00, #8 | |||
| .endm | |||
| // COPY4x2: read four 32-bit elements from each of A01 and A02 (each pointer advances 16 bytes) | |||
| // and write them interleaved to B00: A01[0], A02[0], A01[1], A02[1], ... | |||
| // Only lanes 0 and 1 of v8..v11 are filled and stored (.2s), i.e. 32 bytes; B00 advances by 32. | |||
| .macro COPY4x2 | |||
| prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||
| // Vector A01 fills lane 0 of v8..v11. | |||
| ldr q0, [A01], #16 | |||
| ins v8.s[0], v0.s[0] | |||
| ins v9.s[0], v0.s[1] | |||
| ins v10.s[0], v0.s[2] | |||
| ins v11.s[0], v0.s[3] | |||
| // Vector A02 fills lane 1. | |||
| ldr q1, [A02], #16 | |||
| ins v8.s[1], v1.s[0] | |||
| ins v9.s[1], v1.s[1] | |||
| ins v10.s[1], v1.s[2] | |||
| ins v11.s[1], v1.s[3] | |||
| st1 {v8.2s, v9.2s, v10.2s, v11.2s}, [B00] | |||
| add B00, B00, #32 | |||
| .endm | |||
| // COPY1x2: read one 32-bit element from each of A01 and A02 (each pointer advances 4 bytes) | |||
| // and write the pair to B00, advancing B00 by 8 bytes. | |||
| .macro COPY1x2 | |||
| prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||
| ldr s0, [A01], #4 | |||
| ldr s1, [A02], #4 | |||
| stp s0, s1, [B00] | |||
| add B00, B00, #8 | |||
| .endm | |||
| // COPY4x1: copy 16 bytes (four 32-bit elements) from A01 to B00; both pointers advance by 16. | |||
| .macro COPY4x1 | |||
| prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| ldr q0, [A01], #16 | |||
| str q0, [B00], #16 | |||
| .endm | |||
| // COPY1x1: copy one 32-bit element from A01 to B00; both pointers advance by 4. | |||
| .macro COPY1x1 | |||
| prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| ldr s0, [A01], #4 | |||
| str s0, [B00], #4 | |||
| .endm | |||
| /************************************************************************************** | |||
| * End of macro definitions | |||
| **************************************************************************************/ | |||
| // Entry point: packs N vectors of M elements each (vectors LDA elements apart, starting at A00) | |||
| // into the buffer at B00. Vectors are taken four at a time, then two, then one; within a group | |||
| // the M direction is consumed in blocks of four elements followed by single elements. | |||
| // Elements are 4 bytes wide (LDA is scaled by 4 and the copy macros use s registers / .s lanes) | |||
| // even though the local labels carry a dgemm prefix. Returns 0 in x0. | |||
| PROLOGUE | |||
| .align 5 | |||
| SAVE_REGS | |||
| lsl LDA, LDA, #2 // LDA = LDA * SIZE | |||
| // ---- groups of four vectors ---- | |||
| .Ldgemm_ncopy_L4_BEGIN: | |||
| asr J, N, #2 // J = N / 4 | |||
| cmp J, #0 | |||
| ble .Ldgemm_ncopy_L2_BEGIN | |||
| .align 5 | |||
| .Ldgemm_ncopy_L4_M4_BEGIN: | |||
| // A01..A04 address the four vectors of this group; A00 moves on to the next group. | |||
| mov A01, A00 | |||
| add A02, A01, LDA | |||
| add A03, A02, LDA | |||
| add A04, A03, LDA | |||
| add A00, A04, LDA | |||
| asr I, M, #2 // I = M / 4 | |||
| cmp I, #0 | |||
| ble .Ldgemm_ncopy_L4_M4_40 | |||
| .align 5 | |||
| .Ldgemm_ncopy_L4_M4_20: | |||
| COPY4x4 | |||
| subs I , I , #1 | |||
| bne .Ldgemm_ncopy_L4_M4_20 | |||
| .Ldgemm_ncopy_L4_M4_40: | |||
| // Remaining M & 3 elements are copied one element per vector at a time. | |||
| and I, M , #3 | |||
| cmp I, #0 | |||
| ble .Ldgemm_ncopy_L4_M4_END | |||
| .align 5 | |||
| .Ldgemm_ncopy_L4_M4_60: | |||
| COPY1x4 | |||
| subs I , I , #1 | |||
| bne .Ldgemm_ncopy_L4_M4_60 | |||
| .Ldgemm_ncopy_L4_M4_END: | |||
| subs J , J, #1 // j-- | |||
| bne .Ldgemm_ncopy_L4_M4_BEGIN | |||
| /*********************************************************************************************/ | |||
| // ---- group of two vectors (taken when bit 1 of N is set) ---- | |||
| .Ldgemm_ncopy_L2_BEGIN: | |||
| // tst leaves Z set when the masked bits are all zero and these masks never set N, | |||
| // so each ble below branches exactly when the tested bits of N are clear. | |||
| tst N, #3 | |||
| ble .Ldgemm_ncopy_L999 | |||
| tst N, #2 | |||
| ble .Ldgemm_ncopy_L1_BEGIN | |||
| .Ldgemm_ncopy_L2_M4_BEGIN: | |||
| mov A01, A00 | |||
| add A02, A01, LDA | |||
| add A00, A02, LDA | |||
| asr I, M, #2 // I = M / 4 | |||
| cmp I, #0 | |||
| ble .Ldgemm_ncopy_L2_M4_40 | |||
| .align 5 | |||
| .Ldgemm_ncopy_L2_M4_20: | |||
| COPY4x2 | |||
| subs I , I , #1 | |||
| bne .Ldgemm_ncopy_L2_M4_20 | |||
| .Ldgemm_ncopy_L2_M4_40: | |||
| and I, M , #3 | |||
| cmp I, #0 | |||
| ble .Ldgemm_ncopy_L2_M4_END | |||
| .align 5 | |||
| .Ldgemm_ncopy_L2_M4_60: | |||
| COPY1x2 | |||
| subs I , I , #1 | |||
| bne .Ldgemm_ncopy_L2_M4_60 | |||
| .Ldgemm_ncopy_L2_M4_END: | |||
| /*********************************************************************************************/ | |||
| // ---- single trailing vector (taken when bit 0 of N is set) ---- | |||
| .Ldgemm_ncopy_L1_BEGIN: | |||
| tst N, #1 | |||
| ble .Ldgemm_ncopy_L999 | |||
| .Ldgemm_ncopy_L1_M4_BEGIN: | |||
| mov A01, A00 | |||
| asr I, M, #2 // I = M / 4 | |||
| cmp I, #0 | |||
| ble .Ldgemm_ncopy_L1_M4_40 | |||
| .align 5 | |||
| .Ldgemm_ncopy_L1_M4_20: | |||
| COPY4x1 | |||
| subs I , I , #1 | |||
| bne .Ldgemm_ncopy_L1_M4_20 | |||
| .Ldgemm_ncopy_L1_M4_40: | |||
| and I, M , #3 | |||
| cmp I, #0 | |||
| ble .Ldgemm_ncopy_L1_M4_END | |||
| .align 5 | |||
| .Ldgemm_ncopy_L1_M4_60: | |||
| COPY1x1 | |||
| subs I , I , #1 | |||
| bne .Ldgemm_ncopy_L1_M4_60 | |||
| .Ldgemm_ncopy_L1_M4_END: | |||
| // Common exit: return value 0, restore the spilled registers, return. | |||
| .Ldgemm_ncopy_L999: | |||
| mov x0, #0 | |||
| RESTORE_REGS | |||
| ret | |||
| EPILOGUE | |||
| @@ -39,10 +39,19 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| FLOAT x0, x1, x2, x3, y0, y1, y2, y3; | |||
| v4f32 vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7; | |||
| v4f32 vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7; | |||
| #if defined(DSDOT) | |||
| v2f64 dvx0, dvx1, dvx2, dvx3, dvx4, dvx5, dvx6, dvx7; | |||
| v2f64 dvy0, dvy1, dvy2, dvy3, dvy4, dvy5, dvy6, dvy7; | |||
| v2f64 dot0 = {0, 0}; | |||
| v2f64 dot1 = {0, 0}; | |||
| v2f64 dot2 = {0, 0}; | |||
| v2f64 dot3 = {0, 0}; | |||
| #else | |||
| v4f32 dot0 = {0, 0, 0, 0}; | |||
| v4f32 dot1 = {0, 0, 0, 0}; | |||
| v4f32 dot2 = {0, 0, 0, 0}; | |||
| v4f32 dot3 = {0, 0, 0, 0}; | |||
| #endif | |||
| if (n < 1) return (dot); | |||
| @@ -83,6 +92,61 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| x_pref += 32; | |||
| y_pref += 32; | |||
| #if defined(DSDOT) | |||
| /* Extend single precision to double precision */ | |||
| dvy0 = __msa_fexupr_d(vy0); | |||
| dvy1 = __msa_fexupr_d(vy1); | |||
| dvy2 = __msa_fexupr_d(vy2); | |||
| dvy3 = __msa_fexupr_d(vy3); | |||
| dvy4 = __msa_fexupr_d(vy4); | |||
| dvy5 = __msa_fexupr_d(vy5); | |||
| dvy6 = __msa_fexupr_d(vy6); | |||
| dvy7 = __msa_fexupr_d(vy7); | |||
| vy0 = (v4f32)__msa_fexupl_d(vy0); | |||
| vy1 = (v4f32)__msa_fexupl_d(vy1); | |||
| vy2 = (v4f32)__msa_fexupl_d(vy2); | |||
| vy3 = (v4f32)__msa_fexupl_d(vy3); | |||
| vy4 = (v4f32)__msa_fexupl_d(vy4); | |||
| vy5 = (v4f32)__msa_fexupl_d(vy5); | |||
| vy6 = (v4f32)__msa_fexupl_d(vy6); | |||
| vy7 = (v4f32)__msa_fexupl_d(vy7); | |||
| dvx0 = __msa_fexupr_d(vx0); | |||
| dvx1 = __msa_fexupr_d(vx1); | |||
| dvx2 = __msa_fexupr_d(vx2); | |||
| dvx3 = __msa_fexupr_d(vx3); | |||
| dvx4 = __msa_fexupr_d(vx4); | |||
| dvx5 = __msa_fexupr_d(vx5); | |||
| dvx6 = __msa_fexupr_d(vx6); | |||
| dvx7 = __msa_fexupr_d(vx7); | |||
| vx0 = (v4f32)__msa_fexupl_d(vx0); | |||
| vx1 = (v4f32)__msa_fexupl_d(vx1); | |||
| vx2 = (v4f32)__msa_fexupl_d(vx2); | |||
| vx3 = (v4f32)__msa_fexupl_d(vx3); | |||
| vx4 = (v4f32)__msa_fexupl_d(vx4); | |||
| vx5 = (v4f32)__msa_fexupl_d(vx5); | |||
| vx6 = (v4f32)__msa_fexupl_d(vx6); | |||
| vx7 = (v4f32)__msa_fexupl_d(vx7); | |||
| dot0 += (dvy0 * dvx0); | |||
| dot1 += (dvy1 * dvx1); | |||
| dot2 += (dvy2 * dvx2); | |||
| dot3 += (dvy3 * dvx3); | |||
| dot0 += (dvy4 * dvx4); | |||
| dot1 += (dvy5 * dvx5); | |||
| dot2 += (dvy6 * dvx6); | |||
| dot3 += (dvy7 * dvx7); | |||
| dot0 += ((v2f64)vy0 * (v2f64)vx0); | |||
| dot1 += ((v2f64)vy1 * (v2f64)vx1); | |||
| dot2 += ((v2f64)vy2 * (v2f64)vx2); | |||
| dot3 += ((v2f64)vy3 * (v2f64)vx3); | |||
| dot0 += ((v2f64)vy4 * (v2f64)vx4); | |||
| dot1 += ((v2f64)vy5 * (v2f64)vx5); | |||
| dot2 += ((v2f64)vy6 * (v2f64)vx6); | |||
| dot3 += ((v2f64)vy7 * (v2f64)vx7); | |||
| #else | |||
| dot0 += (vy0 * vx0); | |||
| dot1 += (vy1 * vx1); | |||
| dot2 += (vy2 * vx2); | |||
| @@ -91,6 +155,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| dot1 += (vy5 * vx5); | |||
| dot2 += (vy6 * vx6); | |||
| dot3 += (vy7 * vx7); | |||
| #endif | |||
| } | |||
| if (n & 31) | |||
| @@ -100,10 +165,41 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| LD_SP4_INC(x, 4, vx0, vx1, vx2, vx3); | |||
| LD_SP4_INC(y, 4, vy0, vy1, vy2, vy3); | |||
| #if defined(DSDOT) | |||
| dvy0 = __msa_fexupr_d(vy0); | |||
| dvy1 = __msa_fexupr_d(vy1); | |||
| dvy2 = __msa_fexupr_d(vy2); | |||
| dvy3 = __msa_fexupr_d(vy3); | |||
| vy0 = (v4f32)__msa_fexupl_d(vy0); | |||
| vy1 = (v4f32)__msa_fexupl_d(vy1); | |||
| vy2 = (v4f32)__msa_fexupl_d(vy2); | |||
| vy3 = (v4f32)__msa_fexupl_d(vy3); | |||
| dvx0 = __msa_fexupr_d(vx0); | |||
| dvx1 = __msa_fexupr_d(vx1); | |||
| dvx2 = __msa_fexupr_d(vx2); | |||
| dvx3 = __msa_fexupr_d(vx3); | |||
| vx0 = (v4f32)__msa_fexupl_d(vx0); | |||
| vx1 = (v4f32)__msa_fexupl_d(vx1); | |||
| vx2 = (v4f32)__msa_fexupl_d(vx2); | |||
| vx3 = (v4f32)__msa_fexupl_d(vx3); | |||
| dot0 += (dvy0 * dvx0); | |||
| dot1 += (dvy1 * dvx1); | |||
| dot2 += (dvy2 * dvx2); | |||
| dot3 += (dvy3 * dvx3); | |||
| dot0 += ((v2f64)vy0 * (v2f64)vx0); | |||
| dot1 += ((v2f64)vy1 * (v2f64)vx1); | |||
| dot2 += ((v2f64)vy2 * (v2f64)vx2); | |||
| dot3 += ((v2f64)vy3 * (v2f64)vx3); | |||
| #else | |||
| dot0 += (vy0 * vx0); | |||
| dot1 += (vy1 * vx1); | |||
| dot2 += (vy2 * vx2); | |||
| dot3 += (vy3 * vx3); | |||
| #endif | |||
| } | |||
| if (n & 8) | |||
| @@ -111,8 +207,27 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| LD_SP2_INC(x, 4, vx0, vx1); | |||
| LD_SP2_INC(y, 4, vy0, vy1); | |||
| #if defined(DSDOT) | |||
| dvy0 = __msa_fexupr_d(vy0); | |||
| dvy1 = __msa_fexupr_d(vy1); | |||
| vy0 = (v4f32)__msa_fexupl_d(vy0); | |||
| vy1 = (v4f32)__msa_fexupl_d(vy1); | |||
| dvx0 = __msa_fexupr_d(vx0); | |||
| dvx1 = __msa_fexupr_d(vx1); | |||
| vx0 = (v4f32)__msa_fexupl_d(vx0); | |||
| vx1 = (v4f32)__msa_fexupl_d(vx1); | |||
| dot0 += (dvy0 * dvx0); | |||
| dot1 += (dvy1 * dvx1); | |||
| dot0 += ((v2f64)vy0 * (v2f64)vx0); | |||
| dot1 += ((v2f64)vy1 * (v2f64)vx1); | |||
| #else | |||
| dot0 += (vy0 * vx0); | |||
| dot1 += (vy1 * vx1); | |||
| #endif | |||
| } | |||
| if (n & 4) | |||
| @@ -120,7 +235,16 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| vx0 = LD_SP(x); x += 4; | |||
| vy0 = LD_SP(y); y += 4; | |||
| #if defined(DSDOT) | |||
| dvy0 = __msa_fexupr_d(vy0); | |||
| vy0 = (v4f32)__msa_fexupl_d(vy0); | |||
| dvx0 = __msa_fexupr_d(vx0); | |||
| vx0 = (v4f32)__msa_fexupl_d(vx0); | |||
| dot0 += (dvy0 * dvx0); | |||
| dot0 += ((v2f64)vy0 * (v2f64)vx0); | |||
| #else | |||
| dot0 += (vy0 * vx0); | |||
| #endif | |||
| } | |||
| if (n & 2) | |||
| @@ -128,8 +252,13 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| LD_GP2_INC(x, 1, x0, x1); | |||
| LD_GP2_INC(y, 1, y0, y1); | |||
| #if defined(DSDOT) | |||
| dot += ((double)y0 * (double)x0); | |||
| dot += ((double)y1 * (double)x1); | |||
| #else | |||
| dot += (y0 * x0); | |||
| dot += (y1 * x1); | |||
| #endif | |||
| } | |||
| if (n & 1) | |||
| @@ -137,7 +266,11 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| x0 = *x; | |||
| y0 = *y; | |||
| #if defined(DSDOT) | |||
| dot += ((double)y0 * (double)x0); | |||
| #else | |||
| dot += (y0 * x0); | |||
| #endif | |||
| } | |||
| } | |||
| @@ -145,8 +278,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| dot += dot0[0]; | |||
| dot += dot0[1]; | |||
| #if !defined(DSDOT) | |||
| dot += dot0[2]; | |||
| dot += dot0[3]; | |||
| #endif | |||
| } | |||
| else | |||
| { | |||
| @@ -155,10 +290,17 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| LD_GP4_INC(x, inc_x, x0, x1, x2, x3); | |||
| LD_GP4_INC(y, inc_y, y0, y1, y2, y3); | |||
| #if defined(DSDOT) | |||
| dot += ((double)y0 * (double)x0); | |||
| dot += ((double)y1 * (double)x1); | |||
| dot += ((double)y2 * (double)x2); | |||
| dot += ((double)y3 * (double)x3); | |||
| #else | |||
| dot += (y0 * x0); | |||
| dot += (y1 * x1); | |||
| dot += (y2 * x2); | |||
| dot += (y3 * x3); | |||
| #endif | |||
| } | |||
| if (n & 2) | |||
| @@ -166,8 +308,13 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| LD_GP2_INC(x, inc_x, x0, x1); | |||
| LD_GP2_INC(y, inc_y, y0, y1); | |||
| #if defined(DSDOT) | |||
| dot += ((double)y0 * (double)x0); | |||
| dot += ((double)y1 * (double)x1); | |||
| #else | |||
| dot += (y0 * x0); | |||
| dot += (y1 * x1); | |||
| #endif | |||
| } | |||
| if (n & 1) | |||
| @@ -175,7 +322,11 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| x0 = *x; | |||
| y0 = *y; | |||
| #if defined(DSDOT) | |||
| dot += ((double)y0 * (double)x0); | |||
| #else | |||
| dot += (y0 * x0); | |||
| #endif | |||
| } | |||
| } | |||
| @@ -0,0 +1,160 @@ | |||
| # Kernel source selection: each *_BETA / *KERNEL / *COPY variable names the source file used for that routine. | |||
| # S and D entries point at the real-valued sources and C and Z entries at the z* (complex) sources, | |||
| # except for the TRSM kernels, where all four precisions share the same generic trsm_kernel_* files. | |||
| # GEMM beta scaling: generic implementations. | |||
| SGEMM_BETA = ../generic/gemm_beta.c | |||
| DGEMM_BETA = ../generic/gemm_beta.c | |||
| CGEMM_BETA = ../generic/zgemm_beta.c | |||
| ZGEMM_BETA = ../generic/zgemm_beta.c | |||
| # TRMM kernels: generic 2x2 implementations. | |||
| STRMMKERNEL = ../generic/trmmkernel_2x2.c | |||
| DTRMMKERNEL = ../generic/trmmkernel_2x2.c | |||
| CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c | |||
| ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c | |||
| # GEMM kernels: generic 2x2 kernel plus the matching ncopy_2 / tcopy_2 packing sources. | |||
| # The *COPYOBJ variables name the object files built from those copy sources. | |||
| SGEMMKERNEL = ../generic/gemmkernel_2x2.c | |||
| SGEMMONCOPY = ../generic/gemm_ncopy_2.c | |||
| SGEMMOTCOPY = ../generic/gemm_tcopy_2.c | |||
| SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMKERNEL = ../generic/gemmkernel_2x2.c | |||
| DGEMMONCOPY = ../generic/gemm_ncopy_2.c | |||
| DGEMMOTCOPY = ../generic/gemm_tcopy_2.c | |||
| DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMKERNEL = ../generic/zgemmkernel_2x2.c | |||
| CGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||
| CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||
| CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c | |||
| ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||
| ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||
| ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| # TRSM kernels (LN/LT/RN/RT variants): the same generic sources are listed for S, D, C and Z. | |||
| STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| # Pure C for other kernels: the level-1 routines and GEMV below all come from ../mips/. | |||
| # Absolute-value max / min. | |||
| SAMAXKERNEL = ../mips/amax.c | |||
| DAMAXKERNEL = ../mips/amax.c | |||
| CAMAXKERNEL = ../mips/zamax.c | |||
| ZAMAXKERNEL = ../mips/zamax.c | |||
| SAMINKERNEL = ../mips/amin.c | |||
| DAMINKERNEL = ../mips/amin.c | |||
| CAMINKERNEL = ../mips/zamin.c | |||
| ZAMINKERNEL = ../mips/zamin.c | |||
| # Signed max / min (real precisions only). | |||
| SMAXKERNEL = ../mips/max.c | |||
| DMAXKERNEL = ../mips/max.c | |||
| SMINKERNEL = ../mips/min.c | |||
| DMINKERNEL = ../mips/min.c | |||
| # Index-returning variants of the max / min kernels. | |||
| ISAMAXKERNEL = ../mips/iamax.c | |||
| IDAMAXKERNEL = ../mips/iamax.c | |||
| ICAMAXKERNEL = ../mips/izamax.c | |||
| IZAMAXKERNEL = ../mips/izamax.c | |||
| ISAMINKERNEL = ../mips/iamin.c | |||
| IDAMINKERNEL = ../mips/iamin.c | |||
| ICAMINKERNEL = ../mips/izamin.c | |||
| IZAMINKERNEL = ../mips/izamin.c | |||
| ISMAXKERNEL = ../mips/imax.c | |||
| IDMAXKERNEL = ../mips/imax.c | |||
| ISMINKERNEL = ../mips/imin.c | |||
| IDMINKERNEL = ../mips/imin.c | |||
| # Sums, AXPY, COPY, DOT, NRM2, ROT, SCAL and SWAP. | |||
| SASUMKERNEL = ../mips/asum.c | |||
| DASUMKERNEL = ../mips/asum.c | |||
| CASUMKERNEL = ../mips/zasum.c | |||
| ZASUMKERNEL = ../mips/zasum.c | |||
| SSUMKERNEL = ../mips/sum.c | |||
| DSUMKERNEL = ../mips/sum.c | |||
| CSUMKERNEL = ../mips/zsum.c | |||
| ZSUMKERNEL = ../mips/zsum.c | |||
| SAXPYKERNEL = ../mips/axpy.c | |||
| DAXPYKERNEL = ../mips/axpy.c | |||
| CAXPYKERNEL = ../mips/zaxpy.c | |||
| ZAXPYKERNEL = ../mips/zaxpy.c | |||
| SCOPYKERNEL = ../mips/copy.c | |||
| DCOPYKERNEL = ../mips/copy.c | |||
| CCOPYKERNEL = ../mips/zcopy.c | |||
| ZCOPYKERNEL = ../mips/zcopy.c | |||
| SDOTKERNEL = ../mips/dot.c | |||
| DDOTKERNEL = ../mips/dot.c | |||
| CDOTKERNEL = ../mips/zdot.c | |||
| ZDOTKERNEL = ../mips/zdot.c | |||
| SNRM2KERNEL = ../mips/nrm2.c | |||
| DNRM2KERNEL = ../mips/nrm2.c | |||
| CNRM2KERNEL = ../mips/znrm2.c | |||
| ZNRM2KERNEL = ../mips/znrm2.c | |||
| SROTKERNEL = ../mips/rot.c | |||
| DROTKERNEL = ../mips/rot.c | |||
| CROTKERNEL = ../mips/zrot.c | |||
| ZROTKERNEL = ../mips/zrot.c | |||
| SSCALKERNEL = ../mips/scal.c | |||
| DSCALKERNEL = ../mips/scal.c | |||
| CSCALKERNEL = ../mips/zscal.c | |||
| ZSCALKERNEL = ../mips/zscal.c | |||
| SSWAPKERNEL = ../mips/swap.c | |||
| DSWAPKERNEL = ../mips/swap.c | |||
| CSWAPKERNEL = ../mips/zswap.c | |||
| ZSWAPKERNEL = ../mips/zswap.c | |||
| # GEMV, non-transposed (N) and transposed (T). | |||
| SGEMVNKERNEL = ../mips/gemv_n.c | |||
| DGEMVNKERNEL = ../mips/gemv_n.c | |||
| CGEMVNKERNEL = ../mips/zgemv_n.c | |||
| ZGEMVNKERNEL = ../mips/zgemv_n.c | |||
| SGEMVTKERNEL = ../mips/gemv_t.c | |||
| DGEMVTKERNEL = ../mips/gemv_t.c | |||
| CGEMVTKERNEL = ../mips/zgemv_t.c | |||
| ZGEMVTKERNEL = ../mips/zgemv_t.c | |||
| # SYMV / HEMV kernels (upper and lower variants): generic implementations, including the Q and X entries. | |||
| SSYMV_U_KERNEL = ../generic/symv_k.c | |||
| SSYMV_L_KERNEL = ../generic/symv_k.c | |||
| DSYMV_U_KERNEL = ../generic/symv_k.c | |||
| DSYMV_L_KERNEL = ../generic/symv_k.c | |||
| QSYMV_U_KERNEL = ../generic/symv_k.c | |||
| QSYMV_L_KERNEL = ../generic/symv_k.c | |||
| CSYMV_U_KERNEL = ../generic/zsymv_k.c | |||
| CSYMV_L_KERNEL = ../generic/zsymv_k.c | |||
| ZSYMV_U_KERNEL = ../generic/zsymv_k.c | |||
| ZSYMV_L_KERNEL = ../generic/zsymv_k.c | |||
| XSYMV_U_KERNEL = ../generic/zsymv_k.c | |||
| XSYMV_L_KERNEL = ../generic/zsymv_k.c | |||
| ZHEMV_U_KERNEL = ../generic/zhemv_k.c | |||
| ZHEMV_L_KERNEL = ../generic/zhemv_k.c | |||
| # GEMM3M kernels: both complex precisions use the generic zgemm3mkernel_dump.c source. | |||
| CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c | |||
| ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c | |||
| @@ -90,7 +90,7 @@ | |||
| //Init INF | |||
| lui TEMP, 0x7FF0 | |||
| dsll TEMP, TEMP, 32 | |||
| MTC1 TEMP, INF | |||
| MTC TEMP, INF | |||
| LD a1, 0 * SIZE(X) | |||
| daddiu N, N, -1 | |||
| @@ -1,293 +1,293 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2019, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| /************************************************************************************** | |||
| * Abdelrauf(quickwritereader@gmail.com) | |||
| * BLASTEST : OK | |||
| * CTEST : OK | |||
| * TEST : OK | |||
| * LAPACK-TEST : OK | |||
| **************************************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| #include "def_vsx.h" | |||
| /* Alias for the 64-bit load mnemonic. */ | |||
| #define LOAD ld | |||
| /* Bytes of stack reserved by the prologue (addi SP, SP, -STACKSIZE). */ | |||
| #define STACKSIZE (512 ) | |||
| /* Offset from the adjusted SP of the slot where LR is saved, i.e. 16 bytes above the SP value on entry. */ | |||
| #define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */ | |||
| /* General-purpose register aliases for the kernel operands. */ | |||
| /* Presumably M, N, K are the dimensions and A, B, C the operand pointers; confirm against the included logic file. */ | |||
| /* LDC (and OFFSET, in TRMMKERNEL builds) are loaded from the caller's frame in the prologue. */ | |||
| #define M r3 | |||
| #define N r4 | |||
| #define K r5 | |||
| #define A r8 | |||
| #define B r9 | |||
| #define C r10 | |||
| #define LDC r6 | |||
| #define OFFSET r7 | |||
| /* VSX registers: alpha_r / alpha_i receive the splatted single-precision alpha parts in the prologue; */ | |||
| /* save_permute_1 and permute_mask receive the byte-permute control vectors built there. */ | |||
| #define alpha_r vs19 | |||
| #define alpha_i vs20 | |||
| #define save_permute_1 vs21 | |||
| #define permute_mask vs22 | |||
| /* Literal zero; presumably used as an index/offset operand by the included macros. */ | |||
| #define o0 0 | |||
| /* Scratch, counter and pointer registers. T1-T4 are used in the prologue to assemble the permute constants */ | |||
| /* and PRE is set to 512 there; the remaining aliases are presumably consumed by the included macro/logic files. */ | |||
| /* r14-r31 are saved and restored by this file, so these aliases may be freely overwritten in between. */ | |||
| #define T1 r11 | |||
| #define T2 r12 | |||
| #define T3 r14 | |||
| #define T4 r15 | |||
| #define T5 r16 | |||
| #define T6 r17 | |||
| #define L r18 | |||
| #define T7 r19 | |||
| #define T8 r20 | |||
| #define TEMP_REG r21 | |||
| #define I r22 | |||
| #define J r23 | |||
| #define AO r24 | |||
| #define BO r25 | |||
| #define CO r26 | |||
| #define T9 r27 | |||
| #define T10 r28 | |||
| #define PRE r29 | |||
| #define T12 r30 | |||
| #define T13 r31 | |||
| #include "cgemm_macros_power9.S" | |||
| /* 64-bit halves of the two permute control vectors. The prologue combines them with mtvsrdd as */ | |||
| /* permute_mask = perm_const2:perm_const1 and save_permute_1 = save_permute_12:save_permute_11. */ | |||
| .equ perm_const1, 0x0405060700010203 | |||
| .equ perm_const2, 0x0c0d0e0f08090a0b | |||
| .equ save_permute_12, 0x0c0d0e0f1c1d1e1f | |||
| .equ save_permute_11, 0x0405060714151617 | |||
| #ifndef NEEDPARAM | |||
| /* | |||
|  * Kernel entry and exit around the code in cgemm_logic_power9.S. | |||
|  * The prologue reserves STACKSIZE bytes, saves f14-f31, r14-r31, vs52-vs63 and LR, | |||
|  * loads LDC (and OFFSET for TRMMKERNEL builds) from the caller's frame, prepares the | |||
|  * splatted alpha values and the permute control vectors, then runs the included logic. | |||
|  * The code at .L999 restores every saved register from the same slots and returns. | |||
|  */ | |||
| PROLOGUE | |||
| PROFCODE | |||
| /* Reserve the frame and copy LR into r0 so it can be spilled further down. */ | |||
| addi SP, SP, -STACKSIZE | |||
| mflr r0 | |||
| /* Save f14-f31 at offsets 0..136. */ | |||
| stfd f14, 0(SP) | |||
| stfd f15, 8(SP) | |||
| stfd f16, 16(SP) | |||
| stfd f17, 24(SP) | |||
| stfd f18, 32(SP) | |||
| stfd f19, 40(SP) | |||
| stfd f20, 48(SP) | |||
| stfd f21, 56(SP) | |||
| stfd f22, 64(SP) | |||
| stfd f23, 72(SP) | |||
| stfd f24, 80(SP) | |||
| stfd f25, 88(SP) | |||
| stfd f26, 96(SP) | |||
| stfd f27, 104(SP) | |||
| stfd f28, 112(SP) | |||
| stfd f29, 120(SP) | |||
| stfd f30, 128(SP) | |||
| stfd f31, 136(SP) | |||
| /* Save r31 down to r14 at offsets 144..280. */ | |||
| std r31, 144(SP) | |||
| std r30, 152(SP) | |||
| std r29, 160(SP) | |||
| std r28, 168(SP) | |||
| std r27, 176(SP) | |||
| std r26, 184(SP) | |||
| std r25, 192(SP) | |||
| std r24, 200(SP) | |||
| std r23, 208(SP) | |||
| std r22, 216(SP) | |||
| std r21, 224(SP) | |||
| std r20, 232(SP) | |||
| std r19, 240(SP) | |||
| std r18, 248(SP) | |||
| std r17, 256(SP) | |||
| std r16, 264(SP) | |||
| std r15, 272(SP) | |||
| std r14, 280(SP) | |||
| /* Save vs52-vs63, 16 bytes each, at offsets 288..464. */ | |||
| stxv vs52, 288(SP) | |||
| stxv vs53, 304(SP) | |||
| stxv vs54, 320(SP) | |||
| stxv vs55, 336(SP) | |||
| stxv vs56, 352(SP) | |||
| stxv vs57, 368(SP) | |||
| stxv vs58, 384(SP) | |||
| stxv vs59, 400(SP) | |||
| stxv vs60, 416(SP) | |||
| stxv vs61, 432(SP) | |||
| stxv vs62, 448(SP) | |||
| stxv vs63, 464(SP) | |||
| /* Spill the return address captured in r0 above. */ | |||
| std r0, FLINK_SAVE(SP) | |||
| /* LDC (and OFFSET) are read from the caller's frame, hence the + STACKSIZE. */ | |||
| ld LDC, FRAMESLOT(0) + STACKSIZE(SP) | |||
| #ifdef TRMMKERNEL | |||
| ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) | |||
| #endif | |||
| /* Scale LDC by 2^ZBASE_SHIFT; presumably converts an element count to a byte stride - confirm against common.h. */ | |||
| slwi LDC, LDC, ZBASE_SHIFT | |||
| /* alpha arrives as doubles in vs1 (f1, real part) and vs2 (f2, imaginary part): */ | |||
| /* convert each to single precision and splat word 0 across the whole register. */ | |||
| xscvdpspn alpha_r,vs1 | |||
| xscvdpspn alpha_i,vs2 | |||
| xxspltw alpha_r,alpha_r,0 | |||
| xxspltw alpha_i,alpha_i,0 | |||
| /* Load the reverse permute mask for big endian: | |||
| uint128 = 0x0c0d0e0f08090a0b0405060700010203 | |||
| Each 64-bit constant is assembled 16 bits at a time: lis/ori fill the | |||
| @highest/@higher parts, rldicr rotates them into the upper word, and | |||
| oris/ori fill the @h/@l parts. T2 and T1 receive perm_const2 and | |||
| perm_const1; T3 and T4 receive save_permute_12 and save_permute_11. | |||
| */ | |||
| lis T2, perm_const2@highest | |||
| lis T1, perm_const1@highest | |||
| lis T3, save_permute_12@highest | |||
| lis T4, save_permute_11@highest | |||
| ori T2, T2, perm_const2@higher | |||
| ori T1, T1, perm_const1@higher | |||
| ori T3, T3, save_permute_12@higher | |||
| ori T4, T4, save_permute_11@higher | |||
| rldicr T2, T2, 32, 31 | |||
| rldicr T1, T1, 32, 31 | |||
| rldicr T3, T3, 32, 31 | |||
| rldicr T4, T4, 32, 31 | |||
| oris T2, T2, perm_const2@h | |||
| oris T1, T1, perm_const1@h | |||
| oris T3, T3, save_permute_12@h | |||
| oris T4, T4, save_permute_11@h | |||
| ori T2, T2, perm_const2@l | |||
| ori T1, T1, perm_const1@l | |||
| ori T3, T3, save_permute_12@l | |||
| ori T4, T4, save_permute_11@l | |||
| /* r0 can be reused now that LR is on the stack. PRE = 512 is presumably a prefetch distance for the included macros - confirm. */ | |||
| li r0,0 | |||
| li PRE,512 | |||
| #if defined(CC) || defined(CR) || defined(RC) || defined(RR) | |||
| /*negate for this case as we will use addition -1*(a+b) */ | |||
| xvnegsp alpha_r,alpha_r | |||
| xvnegsp alpha_i,alpha_i | |||
| #endif | |||
| /* Combine the 64-bit halves into 128-bit vectors; the first GPR operand becomes the high doubleword. */ | |||
| mtvsrdd permute_mask,T2,T1 | |||
| mtvsrdd save_permute_1,T3,T4 | |||
| /*mask is reverse permute so we have to make it inner permute */ | |||
| /* (xxpermdi with selector 2 swaps the two doublewords of permute_mask) */ | |||
| xxpermdi permute_mask, permute_mask, permute_mask,2 | |||
| /* Main computation; it is expected to arrive at .L999 below when finished. */ | |||
| #include "cgemm_logic_power9.S" | |||
| .L999: | |||
| /* Restore f14-f31 from offsets 0..136. */ | |||
| lfd f14, 0(SP) | |||
| lfd f15, 8(SP) | |||
| lfd f16, 16(SP) | |||
| lfd f17, 24(SP) | |||
| lfd f18, 32(SP) | |||
| lfd f19, 40(SP) | |||
| lfd f20, 48(SP) | |||
| lfd f21, 56(SP) | |||
| lfd f22, 64(SP) | |||
| lfd f23, 72(SP) | |||
| lfd f24, 80(SP) | |||
| lfd f25, 88(SP) | |||
| lfd f26, 96(SP) | |||
| lfd f27, 104(SP) | |||
| lfd f28, 112(SP) | |||
| lfd f29, 120(SP) | |||
| lfd f30, 128(SP) | |||
| lfd f31, 136(SP) | |||
| /* Restore r31 down to r14 from offsets 144..280. */ | |||
| ld r31, 144(SP) | |||
| ld r30, 152(SP) | |||
| ld r29, 160(SP) | |||
| ld r28, 168(SP) | |||
| ld r27, 176(SP) | |||
| ld r26, 184(SP) | |||
| ld r25, 192(SP) | |||
| ld r24, 200(SP) | |||
| ld r23, 208(SP) | |||
| ld r22, 216(SP) | |||
| ld r21, 224(SP) | |||
| ld r20, 232(SP) | |||
| ld r19, 240(SP) | |||
| ld r18, 248(SP) | |||
| ld r17, 256(SP) | |||
| ld r16, 264(SP) | |||
| ld r15, 272(SP) | |||
| ld r14, 280(SP) | |||
| /* Reload the saved return address; it is moved back into LR a few instructions later. */ | |||
| ld r0, FLINK_SAVE(SP) | |||
| /* Restore vs52-vs63 from offsets 288..464. */ | |||
| lxv vs52, 288(SP) | |||
| lxv vs53, 304(SP) | |||
| lxv vs54, 320(SP) | |||
| lxv vs55, 336(SP) | |||
| lxv vs56, 352(SP) | |||
| lxv vs57, 368(SP) | |||
| lxv vs58, 384(SP) | |||
| lxv vs59, 400(SP) | |||
| mtlr r0 | |||
| lxv vs60, 416(SP) | |||
| lxv vs61, 432(SP) | |||
| lxv vs62, 448(SP) | |||
| lxv vs63, 464(SP) | |||
| /* Release the frame and return to the caller. */ | |||
| addi SP, SP, STACKSIZE | |||
| blr | |||
| EPILOGUE | |||
| #endif | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2019, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| /************************************************************************************** | |||
| * Abdelrauf(quickwritereader@gmail.com) | |||
| * BLASTEST : OK | |||
| * CTEST : OK | |||
| * TEST : OK | |||
| * LAPACK-TEST : OK | |||
| **************************************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| #include "def_vsx.h" | |||
| /* Alias for the 64-bit load mnemonic. */ | |||
| #define LOAD ld | |||
| /* Bytes of stack reserved by the prologue (addi SP, SP, -STACKSIZE). */ | |||
| #define STACKSIZE (512 ) | |||
| /* Offset from the adjusted SP of the slot where LR is saved, i.e. 16 bytes above the SP value on entry. */ | |||
| #define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */ | |||
| /* General-purpose register aliases for the kernel operands. */ | |||
| /* Presumably M, N, K are the dimensions and A, B, C the operand pointers; confirm against the included logic file. */ | |||
| /* LDC (and OFFSET, in TRMMKERNEL builds) are loaded from the caller's frame in the prologue. */ | |||
| #define M r3 | |||
| #define N r4 | |||
| #define K r5 | |||
| #define A r8 | |||
| #define B r9 | |||
| #define C r10 | |||
| #define LDC r6 | |||
| #define OFFSET r7 | |||
| /* VSX registers: alpha_r / alpha_i receive the splatted single-precision alpha parts in the prologue; */ | |||
| /* save_permute_1 and permute_mask receive the byte-permute control vectors built there. */ | |||
| #define alpha_r vs19 | |||
| #define alpha_i vs20 | |||
| #define save_permute_1 vs21 | |||
| #define permute_mask vs22 | |||
| /* Literal zero; presumably used as an index/offset operand by the included macros. */ | |||
| #define o0 0 | |||
| /* Scratch, counter and pointer registers. T1-T4 are used in the prologue to assemble the permute constants */ | |||
| /* and PRE is set to 512 there; the remaining aliases are presumably consumed by the included macro/logic files. */ | |||
| /* r14-r31 are saved and restored by this file, so these aliases may be freely overwritten in between. */ | |||
| #define T1 r11 | |||
| #define T2 r12 | |||
| #define T3 r14 | |||
| #define T4 r15 | |||
| #define T5 r16 | |||
| #define T6 r17 | |||
| #define L r18 | |||
| #define T7 r19 | |||
| #define T8 r20 | |||
| #define TEMP_REG r21 | |||
| #define I r22 | |||
| #define J r23 | |||
| #define AO r24 | |||
| #define BO r25 | |||
| #define CO r26 | |||
| #define T9 r27 | |||
| #define T10 r28 | |||
| #define PRE r29 | |||
| #define T12 r30 | |||
| #define T13 r31 | |||
| #include "cgemm_macros_power9.S" | |||
| /* 64-bit halves of the two permute control vectors. The prologue combines them with mtvsrdd as */ | |||
| /* permute_mask = perm_const2:perm_const1 and save_permute_1 = save_permute_12:save_permute_11. */ | |||
| .equ perm_const1, 0x0405060700010203 | |||
| .equ perm_const2, 0x0c0d0e0f08090a0b | |||
| .equ save_permute_12, 0x0c0d0e0f1c1d1e1f | |||
| .equ save_permute_11, 0x0405060714151617 | |||
| #ifndef NEEDPARAM | |||
| /* | |||
|  * Kernel entry and exit around the code in cgemm_logic_power9.S. | |||
|  * The prologue reserves STACKSIZE bytes, saves f14-f31, r14-r31, vs52-vs63 and LR, | |||
|  * loads LDC (and OFFSET for TRMMKERNEL builds) from the caller's frame, prepares the | |||
|  * splatted alpha values and the permute control vectors, then runs the included logic. | |||
|  * The code at .L999 restores every saved register from the same slots and returns. | |||
|  */ | |||
| PROLOGUE | |||
| PROFCODE | |||
| /* Reserve the frame and copy LR into r0 so it can be spilled further down. */ | |||
| addi SP, SP, -STACKSIZE | |||
| mflr r0 | |||
| /* Save f14-f31 at offsets 0..136. */ | |||
| stfd f14, 0(SP) | |||
| stfd f15, 8(SP) | |||
| stfd f16, 16(SP) | |||
| stfd f17, 24(SP) | |||
| stfd f18, 32(SP) | |||
| stfd f19, 40(SP) | |||
| stfd f20, 48(SP) | |||
| stfd f21, 56(SP) | |||
| stfd f22, 64(SP) | |||
| stfd f23, 72(SP) | |||
| stfd f24, 80(SP) | |||
| stfd f25, 88(SP) | |||
| stfd f26, 96(SP) | |||
| stfd f27, 104(SP) | |||
| stfd f28, 112(SP) | |||
| stfd f29, 120(SP) | |||
| stfd f30, 128(SP) | |||
| stfd f31, 136(SP) | |||
| /* Save r31 down to r14 at offsets 144..280. */ | |||
| std r31, 144(SP) | |||
| std r30, 152(SP) | |||
| std r29, 160(SP) | |||
| std r28, 168(SP) | |||
| std r27, 176(SP) | |||
| std r26, 184(SP) | |||
| std r25, 192(SP) | |||
| std r24, 200(SP) | |||
| std r23, 208(SP) | |||
| std r22, 216(SP) | |||
| std r21, 224(SP) | |||
| std r20, 232(SP) | |||
| std r19, 240(SP) | |||
| std r18, 248(SP) | |||
| std r17, 256(SP) | |||
| std r16, 264(SP) | |||
| std r15, 272(SP) | |||
| std r14, 280(SP) | |||
| /* Save vs52-vs63, 16 bytes each, at offsets 288..464. */ | |||
| stxv vs52, 288(SP) | |||
| stxv vs53, 304(SP) | |||
| stxv vs54, 320(SP) | |||
| stxv vs55, 336(SP) | |||
| stxv vs56, 352(SP) | |||
| stxv vs57, 368(SP) | |||
| stxv vs58, 384(SP) | |||
| stxv vs59, 400(SP) | |||
| stxv vs60, 416(SP) | |||
| stxv vs61, 432(SP) | |||
| stxv vs62, 448(SP) | |||
| stxv vs63, 464(SP) | |||
| /* Spill the return address captured in r0 above. */ | |||
| std r0, FLINK_SAVE(SP) | |||
| /* LDC (and OFFSET) are read from the caller's frame, hence the + STACKSIZE. */ | |||
| ld LDC, FRAMESLOT(0) + STACKSIZE(SP) | |||
| #ifdef TRMMKERNEL | |||
| ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) | |||
| #endif | |||
| /* Scale LDC by 2^ZBASE_SHIFT; presumably converts an element count to a byte stride - confirm against common.h. */ | |||
| slwi LDC, LDC, ZBASE_SHIFT | |||
| /* alpha arrives as doubles in vs1 (f1, real part) and vs2 (f2, imaginary part): */ | |||
| /* convert each to single precision and splat word 0 across the whole register. */ | |||
| xscvdpspn alpha_r,vs1 | |||
| xscvdpspn alpha_i,vs2 | |||
| xxspltw alpha_r,alpha_r,0 | |||
| xxspltw alpha_i,alpha_i,0 | |||
| /* Load the reverse permute mask for big endian: | |||
| uint128 = 0x0c0d0e0f08090a0b0405060700010203 | |||
| Each 64-bit constant is assembled 16 bits at a time: lis/ori fill the | |||
| @highest/@higher parts, rldicr rotates them into the upper word, and | |||
| oris/ori fill the @h/@l parts. T2 and T1 receive perm_const2 and | |||
| perm_const1; T3 and T4 receive save_permute_12 and save_permute_11. | |||
| */ | |||
| lis T2, perm_const2@highest | |||
| lis T1, perm_const1@highest | |||
| lis T3, save_permute_12@highest | |||
| lis T4, save_permute_11@highest | |||
| ori T2, T2, perm_const2@higher | |||
| ori T1, T1, perm_const1@higher | |||
| ori T3, T3, save_permute_12@higher | |||
| ori T4, T4, save_permute_11@higher | |||
| rldicr T2, T2, 32, 31 | |||
| rldicr T1, T1, 32, 31 | |||
| rldicr T3, T3, 32, 31 | |||
| rldicr T4, T4, 32, 31 | |||
| oris T2, T2, perm_const2@h | |||
| oris T1, T1, perm_const1@h | |||
| oris T3, T3, save_permute_12@h | |||
| oris T4, T4, save_permute_11@h | |||
| ori T2, T2, perm_const2@l | |||
| ori T1, T1, perm_const1@l | |||
| ori T3, T3, save_permute_12@l | |||
| ori T4, T4, save_permute_11@l | |||
| /* r0 can be reused now that LR is on the stack. PRE = 512 is presumably a prefetch distance for the included macros - confirm. */ | |||
| li r0,0 | |||
| li PRE,512 | |||
| #if defined(CC) || defined(CR) || defined(RC) || defined(RR) | |||
| /*negate for this case as we will use addition -1*(a+b) */ | |||
| xvnegsp alpha_r,alpha_r | |||
| xvnegsp alpha_i,alpha_i | |||
| #endif | |||
| /* Combine the 64-bit halves into 128-bit vectors; the first GPR operand becomes the high doubleword. */ | |||
| mtvsrdd permute_mask,T2,T1 | |||
| mtvsrdd save_permute_1,T3,T4 | |||
| /*mask is reverse permute so we have to make it inner permute */ | |||
| /* (xxpermdi with selector 2 swaps the two doublewords of permute_mask) */ | |||
| xxpermdi permute_mask, permute_mask, permute_mask,2 | |||
| /* Main computation; it is expected to arrive at .L999 below when finished. */ | |||
| #include "cgemm_logic_power9.S" | |||
| .L999: | |||
| /* Restore f14-f31 from offsets 0..136. */ | |||
| lfd f14, 0(SP) | |||
| lfd f15, 8(SP) | |||
| lfd f16, 16(SP) | |||
| lfd f17, 24(SP) | |||
| lfd f18, 32(SP) | |||
| lfd f19, 40(SP) | |||
| lfd f20, 48(SP) | |||
| lfd f21, 56(SP) | |||
| lfd f22, 64(SP) | |||
| lfd f23, 72(SP) | |||
| lfd f24, 80(SP) | |||
| lfd f25, 88(SP) | |||
| lfd f26, 96(SP) | |||
| lfd f27, 104(SP) | |||
| lfd f28, 112(SP) | |||
| lfd f29, 120(SP) | |||
| lfd f30, 128(SP) | |||
| lfd f31, 136(SP) | |||
| /* Restore r31 down to r14 from offsets 144..280. */ | |||
| ld r31, 144(SP) | |||
| ld r30, 152(SP) | |||
| ld r29, 160(SP) | |||
| ld r28, 168(SP) | |||
| ld r27, 176(SP) | |||
| ld r26, 184(SP) | |||
| ld r25, 192(SP) | |||
| ld r24, 200(SP) | |||
| ld r23, 208(SP) | |||
| ld r22, 216(SP) | |||
| ld r21, 224(SP) | |||
| ld r20, 232(SP) | |||
| ld r19, 240(SP) | |||
| ld r18, 248(SP) | |||
| ld r17, 256(SP) | |||
| ld r16, 264(SP) | |||
| ld r15, 272(SP) | |||
| ld r14, 280(SP) | |||
| /* Reload the saved return address; it is moved back into LR a few instructions later. */ | |||
| ld r0, FLINK_SAVE(SP) | |||
| /* Restore vs52-vs63 from offsets 288..464. */ | |||
| lxv vs52, 288(SP) | |||
| lxv vs53, 304(SP) | |||
| lxv vs54, 320(SP) | |||
| lxv vs55, 336(SP) | |||
| lxv vs56, 352(SP) | |||
| lxv vs57, 368(SP) | |||
| lxv vs58, 384(SP) | |||
| lxv vs59, 400(SP) | |||
| mtlr r0 | |||
| lxv vs60, 416(SP) | |||
| lxv vs61, 432(SP) | |||
| lxv vs62, 448(SP) | |||
| lxv vs63, 464(SP) | |||
| /* Release the frame and return to the caller. */ | |||
| addi SP, SP, STACKSIZE | |||
| blr | |||
| EPILOGUE | |||
| #endif | |||
| @@ -1,233 +1,233 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2018, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | |||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||
| /* | |||
|  * Apply a plane rotation with real coefficients c and s, in place, to the | |||
|  * float arrays x and y: | |||
|  *     x[i] = c * x[i] + s * y[i] | |||
|  *     y[i] = c * y[i] - s * x[i] | |||
|  * | |||
|  * n counts 2-float elements (the memory operands cover 2*n floats). Each pass | |||
|  * handles 8 of them, i.e. 64 bytes of x and 64 bytes of y, and decrements n | |||
|  * by 8. No remainder is handled here; the visible caller passes a positive | |||
|  * multiple of 8 (n & -8). | |||
|  * | |||
|  * The loop at label "one" is software-pipelined: the first 64 bytes of x and y | |||
|  * are loaded before the loop, the loads for the next pass are issued while the | |||
|  * current pass is still being computed, and the pointers are stepped back by | |||
|  * 64 before each store and forward by 128 afterwards. Label "two" finishes | |||
|  * the last pass without loading any further data. | |||
|  * | |||
|  * Register use: vs32-vs35 hold x, vs48-vs51 hold y, vs36 / vs37 hold c / s | |||
|  * splatted to all words, and vs40-vs47 plus t0-t7 are temporaries. | |||
|  */ | |||
| static void crot_kernel_8 (long n, float *x, float *y, float c, float s) | |||
| { | |||
| // Temporaries bound to the output operands x0..x7 of the asm statement below. | |||
| __vector float t0; | |||
| __vector float t1; | |||
| __vector float t2; | |||
| __vector float t3; | |||
| __vector float t4; | |||
| __vector float t5; | |||
| __vector float t6; | |||
| __vector float t7; | |||
| __asm__ | |||
| ( | |||
| "xscvdpspn 36, %x[cos] \n\t" // load c to all words | |||
| "xxspltw 36, 36, 0 \n\t" | |||
| "xscvdpspn 37, %x[sin] \n\t" // load s to all words | |||
| "xxspltw 37, 37, 0 \n\t" | |||
| "lxvd2x 32, 0, %[x_ptr] \n\t" // load x | |||
| "lxvd2x 33, %[i16], %[x_ptr] \n\t" | |||
| "lxvd2x 34, %[i32], %[x_ptr] \n\t" | |||
| "lxvd2x 35, %[i48], %[x_ptr] \n\t" | |||
| "lxvd2x 48, 0, %[y_ptr] \n\t" // load y | |||
| "lxvd2x 49, %[i16], %[y_ptr] \n\t" | |||
| "lxvd2x 50, %[i32], %[y_ptr] \n\t" | |||
| "lxvd2x 51, %[i48], %[y_ptr] \n\t" | |||
| "addi %[x_ptr], %[x_ptr], 64 \n\t" | |||
| "addi %[y_ptr], %[y_ptr], 64 \n\t" | |||
| "addic. %[temp_n], %[temp_n], -8 \n\t" | |||
| "ble two%= \n\t" // only one pass requested: skip the pipelined loop | |||
| ".align 5 \n\t" | |||
| "one%=: \n\t" | |||
| "xvmulsp 40, 32, 36 \n\t" // c * x | |||
| "xvmulsp 41, 33, 36 \n\t" | |||
| "xvmulsp 42, 34, 36 \n\t" | |||
| "xvmulsp 43, 35, 36 \n\t" | |||
| "xvmulsp %x[x0], 48, 36 \n\t" // c * y | |||
| "xvmulsp %x[x2], 49, 36 \n\t" | |||
| "xvmulsp %x[x1], 50, 36 \n\t" | |||
| "xvmulsp %x[x3], 51, 36 \n\t" | |||
| "xvmulsp 44, 32, 37 \n\t" // s * x | |||
| "xvmulsp 45, 33, 37 \n\t" | |||
| "lxvd2x 32, 0, %[x_ptr] \n\t" // load x | |||
| "lxvd2x 33, %[i16], %[x_ptr] \n\t" | |||
| "xvmulsp 46, 34, 37 \n\t" | |||
| "xvmulsp 47, 35, 37 \n\t" | |||
| "lxvd2x 34, %[i32], %[x_ptr] \n\t" | |||
| "lxvd2x 35, %[i48], %[x_ptr] \n\t" | |||
| "xvmulsp %x[x4], 48, 37 \n\t" // s * y | |||
| "xvmulsp %x[x5], 49, 37 \n\t" | |||
| "lxvd2x 48, 0, %[y_ptr] \n\t" // load y | |||
| "lxvd2x 49, %[i16], %[y_ptr] \n\t" | |||
| "xvmulsp %x[x6], 50, 37 \n\t" | |||
| "xvmulsp %x[x7], 51, 37 \n\t" | |||
| "lxvd2x 50, %[i32], %[y_ptr] \n\t" | |||
| "lxvd2x 51, %[i48], %[y_ptr] \n\t" | |||
| "xvaddsp 40, 40, %x[x4] \n\t" // c * x + s * y | |||
| "xvaddsp 41, 41, %x[x5] \n\t" // c * x + s * y | |||
| "addi %[x_ptr], %[x_ptr], -64 \n\t" // step back to the data just computed | |||
| "addi %[y_ptr], %[y_ptr], -64 \n\t" | |||
| "xvaddsp 42, 42, %x[x6] \n\t" // c * x + s * y | |||
| "xvaddsp 43, 43, %x[x7] \n\t" // c * x + s * y | |||
| "xvsubsp %x[x0], %x[x0], 44 \n\t" // c * y - s * x | |||
| "xvsubsp %x[x2], %x[x2], 45 \n\t" // c * y - s * x | |||
| "xvsubsp %x[x1], %x[x1], 46 \n\t" // c * y - s * x | |||
| "xvsubsp %x[x3], %x[x3], 47 \n\t" // c * y - s * x | |||
| "stxvd2x 40, 0, %[x_ptr] \n\t" // store x | |||
| "stxvd2x 41, %[i16], %[x_ptr] \n\t" | |||
| "stxvd2x 42, %[i32], %[x_ptr] \n\t" | |||
| "stxvd2x 43, %[i48], %[x_ptr] \n\t" | |||
| "stxvd2x %x[x0], 0, %[y_ptr] \n\t" // store y | |||
| "stxvd2x %x[x2], %[i16], %[y_ptr] \n\t" | |||
| "stxvd2x %x[x1], %[i32], %[y_ptr] \n\t" | |||
| "stxvd2x %x[x3], %[i48], %[y_ptr] \n\t" | |||
| "addi %[x_ptr], %[x_ptr], 128 \n\t" // skip past the data already loaded for the next pass | |||
| "addi %[y_ptr], %[y_ptr], 128 \n\t" | |||
| "addic. %[temp_n], %[temp_n], -8 \n\t" | |||
| "bgt one%= \n\t" | |||
| "two%=: \n\t" // final pass: operate on the data already in vs32-vs35 / vs48-vs51 | |||
| "xvmulsp 40, 32, 36 \n\t" // c * x | |||
| "xvmulsp 41, 33, 36 \n\t" | |||
| "xvmulsp 42, 34, 36 \n\t" | |||
| "xvmulsp 43, 35, 36 \n\t" | |||
| "xvmulsp %x[x0], 48, 36 \n\t" // c * y | |||
| "xvmulsp %x[x2], 49, 36 \n\t" | |||
| "xvmulsp %x[x1], 50, 36 \n\t" | |||
| "xvmulsp %x[x3], 51, 36 \n\t" | |||
| "xvmulsp 44, 32, 37 \n\t" // s * x | |||
| "xvmulsp 45, 33, 37 \n\t" | |||
| "xvmulsp 46, 34, 37 \n\t" | |||
| "xvmulsp 47, 35, 37 \n\t" | |||
| "xvmulsp %x[x4], 48, 37 \n\t" // s * y | |||
| "xvmulsp %x[x5], 49, 37 \n\t" | |||
| "xvmulsp %x[x6], 50, 37 \n\t" | |||
| "xvmulsp %x[x7], 51, 37 \n\t" | |||
| "addi %[x_ptr], %[x_ptr], -64 \n\t" | |||
| "addi %[y_ptr], %[y_ptr], -64 \n\t" | |||
| "xvaddsp 40, 40, %x[x4] \n\t" // c * x + s * y | |||
| "xvaddsp 41, 41, %x[x5] \n\t" // c * x + s * y | |||
| "xvaddsp 42, 42, %x[x6] \n\t" // c * x + s * y | |||
| "xvaddsp 43, 43, %x[x7] \n\t" // c * x + s * y | |||
| "xvsubsp %x[x0], %x[x0], 44 \n\t" // c * y - s * x | |||
| "xvsubsp %x[x2], %x[x2], 45 \n\t" // c * y - s * x | |||
| "xvsubsp %x[x1], %x[x1], 46 \n\t" // c * y - s * x | |||
| "xvsubsp %x[x3], %x[x3], 47 \n\t" // c * y - s * x | |||
| "stxvd2x 40, 0, %[x_ptr] \n\t" // store x | |||
| "stxvd2x 41, %[i16], %[x_ptr] \n\t" | |||
| "stxvd2x 42, %[i32], %[x_ptr] \n\t" | |||
| "stxvd2x 43, %[i48], %[x_ptr] \n\t" | |||
| "stxvd2x %x[x0], 0, %[y_ptr] \n\t" // store y | |||
| "stxvd2x %x[x2], %[i16], %[y_ptr] \n\t" | |||
| "stxvd2x %x[x1], %[i32], %[y_ptr] \n\t" | |||
| "stxvd2x %x[x3], %[i48], %[y_ptr] " | |||
| : | |||
| // Outputs: both arrays are read and written through memory operands sized 2*n floats; | |||
| // n, x and y are updated in place by the asm. | |||
| [mem_x] "+m" (*(float (*)[2*n])x), | |||
| [mem_y] "+m" (*(float (*)[2*n])y), | |||
| [temp_n] "+r" (n), | |||
| [x_ptr] "+&b" (x), | |||
| [y_ptr] "+&b" (y), | |||
| // Note: x1 is bound to t2 and x2 to t1; the asm uses x0, x2, x1, x3 in that order, | |||
| // so the four "c * y" temporaries are t0..t3 in sequence. | |||
| [x0] "=wa" (t0), | |||
| [x1] "=wa" (t2), | |||
| [x2] "=wa" (t1), | |||
| [x3] "=wa" (t3), | |||
| [x4] "=wa" (t4), | |||
| [x5] "=wa" (t5), | |||
| [x6] "=wa" (t6), | |||
| [x7] "=wa" (t7) | |||
| : | |||
| // Inputs: the rotation coefficients and the byte offsets used as index registers. | |||
| [cos] "f" (c), | |||
| [sin] "f" (s), | |||
| [i16] "b" (16), | |||
| [i32] "b" (32), | |||
| [i48] "b" (48) | |||
| : | |||
| // Clobbers: cr0 is written by addic.; the listed VSX registers are used by number above. | |||
| "cr0", | |||
| "vs32","vs33","vs34","vs35","vs36","vs37", | |||
| "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", | |||
| "vs48","vs49","vs50","vs51" | |||
| ); | |||
| } | |||
| #endif | |||
| #endif | |||
| /* | |||
| * Rotates n two-FLOAT elements of x and y in place: each FLOAT pair of x | |||
| * becomes c*x + s*y and the matching pair of y becomes c*y - s*x, where the | |||
| * y update still uses the old x values. inc_x and inc_y are strides counted | |||
| * in elements. Returns 0 in every case; n <= 0 leaves both arrays untouched. | |||
| */ | |||
| int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) | |||
| { | |||
| BLASLONG done = 0; /* elements rotated so far */ | |||
| BLASLONG px = 0, py = 0; /* FLOAT offsets of the current element in x and y */ | |||
| FLOAT new_x0, new_x1; /* rotated x pair, held back until y has consumed the old x */ | |||
| if ( n <= 0 ) return(0); | |||
| if ( (inc_x != 1) || (inc_y != 1) ) | |||
| { | |||
| /* General strides: x and y advance independently. */ | |||
| const BLASLONG step_x = 2 * inc_x ; | |||
| const BLASLONG step_y = 2 * inc_y ; | |||
| for ( ; done < n; done++ ) | |||
| { | |||
| new_x0 = c*x[px] + s*y[py] ; | |||
| new_x1 = c*x[px+1] + s*y[py+1] ; | |||
| y[py] = c*y[py] - s*x[px] ; | |||
| y[py+1] = c*y[py+1] - s*x[px+1] ; | |||
| x[px] = new_x0 ; | |||
| x[px+1] = new_x1 ; | |||
| px += step_x ; | |||
| py += step_y ; | |||
| } | |||
| return(0); | |||
| } | |||
| /* Unit strides from here on: the single offset px addresses both arrays. */ | |||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||
| { | |||
| /* The largest multiple of 8 elements goes through the vector kernel. */ | |||
| const BLASLONG bulk = n & -8; | |||
| if ( bulk > 0 ) | |||
| { | |||
| crot_kernel_8(bulk, x, y, c, s); | |||
| done = bulk; | |||
| px = 2*bulk; | |||
| } | |||
| } | |||
| #endif | |||
| /* Scalar loop for whatever the kernel did not cover. */ | |||
| for ( ; done < n; done++ ) | |||
| { | |||
| new_x0 = c*x[px] + s*y[px] ; | |||
| new_x1 = c*x[px+1] + s*y[px+1] ; | |||
| y[px] = c*y[px] - s*x[px] ; | |||
| y[px+1] = c*y[px+1] - s*x[px+1] ; | |||
| x[px] = new_x0 ; | |||
| x[px+1] = new_x1 ; | |||
| px += 2 ; | |||
| } | |||
| return(0); | |||
| } | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2018, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | |||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||
| static void crot_kernel_8 (long n, float *x, float *y, float c, float s) /* In-place rotation x = c*x + s*y, y = c*y - s*x over 2*n floats of each array, 16 floats (64 bytes) per array per pass. One block is always processed before n is tested, so n must be at least 8; the caller in this file passes a positive multiple of 8. */ | |||
| { | |||
| __vector float t0; /* t0..t7: scratch vectors the compiler allocates as asm outputs; never read after the asm */ | |||
| __vector float t1; | |||
| __vector float t2; | |||
| __vector float t3; | |||
| __vector float t4; | |||
| __vector float t5; | |||
| __vector float t6; | |||
| __vector float t7; | |||
| __asm__ | |||
| ( | |||
| "xscvdpspn 36, %x[cos] \n\t" // convert c to single precision, then splat: vs36 = c in all words | |||
| "xxspltw 36, 36, 0 \n\t" | |||
| "xscvdpspn 37, %x[sin] \n\t" // same for s: vs37 = s in all words | |||
| "xxspltw 37, 37, 0 \n\t" | |||
| "lxvd2x 32, 0, %[x_ptr] \n\t" // preload the first 64 bytes of x into vs32-vs35 | |||
| "lxvd2x 33, %[i16], %[x_ptr] \n\t" | |||
| "lxvd2x 34, %[i32], %[x_ptr] \n\t" | |||
| "lxvd2x 35, %[i48], %[x_ptr] \n\t" | |||
| "lxvd2x 48, 0, %[y_ptr] \n\t" // preload the first 64 bytes of y into vs48-vs51 | |||
| "lxvd2x 49, %[i16], %[y_ptr] \n\t" | |||
| "lxvd2x 50, %[i32], %[y_ptr] \n\t" | |||
| "lxvd2x 51, %[i48], %[y_ptr] \n\t" | |||
| "addi %[x_ptr], %[x_ptr], 64 \n\t" /* pointers now run one 64-byte block ahead of the data held in registers */ | |||
| "addi %[y_ptr], %[y_ptr], 64 \n\t" | |||
| "addic. %[temp_n], %[temp_n], -8 \n\t" /* n -= 8 and record the result in cr0 */ | |||
| "ble two%= \n\t" /* only one block in total: skip the pipelined loop */ | |||
| ".align 5 \n\t" | |||
| "one%=: \n\t" /* main loop: rotate the block in registers while loading the next one */ | |||
| "xvmulsp 40, 32, 36 \n\t" // c * x | |||
| "xvmulsp 41, 33, 36 \n\t" | |||
| "xvmulsp 42, 34, 36 \n\t" | |||
| "xvmulsp 43, 35, 36 \n\t" | |||
| "xvmulsp %x[x0], 48, 36 \n\t" // c * y | |||
| "xvmulsp %x[x2], 49, 36 \n\t" | |||
| "xvmulsp %x[x1], 50, 36 \n\t" | |||
| "xvmulsp %x[x3], 51, 36 \n\t" | |||
| "xvmulsp 44, 32, 37 \n\t" // s * x | |||
| "xvmulsp 45, 33, 37 \n\t" | |||
| "lxvd2x 32, 0, %[x_ptr] \n\t" // start loading the next x block (old vs32/vs33 are no longer needed) | |||
| "lxvd2x 33, %[i16], %[x_ptr] \n\t" | |||
| "xvmulsp 46, 34, 37 \n\t" | |||
| "xvmulsp 47, 35, 37 \n\t" | |||
| "lxvd2x 34, %[i32], %[x_ptr] \n\t" | |||
| "lxvd2x 35, %[i48], %[x_ptr] \n\t" | |||
| "xvmulsp %x[x4], 48, 37 \n\t" // s * y | |||
| "xvmulsp %x[x5], 49, 37 \n\t" | |||
| "lxvd2x 48, 0, %[y_ptr] \n\t" // start loading the next y block | |||
| "lxvd2x 49, %[i16], %[y_ptr] \n\t" | |||
| "xvmulsp %x[x6], 50, 37 \n\t" | |||
| "xvmulsp %x[x7], 51, 37 \n\t" | |||
| "lxvd2x 50, %[i32], %[y_ptr] \n\t" | |||
| "lxvd2x 51, %[i48], %[y_ptr] \n\t" | |||
| "xvaddsp 40, 40, %x[x4] \n\t" // c * x + s * y | |||
| "xvaddsp 41, 41, %x[x5] \n\t" // c * x + s * y | |||
| "addi %[x_ptr], %[x_ptr], -64 \n\t" /* step back to the block whose results are about to be stored */ | |||
| "addi %[y_ptr], %[y_ptr], -64 \n\t" | |||
| "xvaddsp 42, 42, %x[x6] \n\t" // c * x + s * y | |||
| "xvaddsp 43, 43, %x[x7] \n\t" // c * x + s * y | |||
| "xvsubsp %x[x0], %x[x0], 44 \n\t" // c * y - s * x | |||
| "xvsubsp %x[x2], %x[x2], 45 \n\t" // c * y - s * x | |||
| "xvsubsp %x[x1], %x[x1], 46 \n\t" // c * y - s * x | |||
| "xvsubsp %x[x3], %x[x3], 47 \n\t" // c * y - s * x | |||
| "stxvd2x 40, 0, %[x_ptr] \n\t" // store rotated x | |||
| "stxvd2x 41, %[i16], %[x_ptr] \n\t" | |||
| "stxvd2x 42, %[i32], %[x_ptr] \n\t" | |||
| "stxvd2x 43, %[i48], %[x_ptr] \n\t" | |||
| "stxvd2x %x[x0], 0, %[y_ptr] \n\t" // store rotated y | |||
| "stxvd2x %x[x2], %[i16], %[y_ptr] \n\t" | |||
| "stxvd2x %x[x1], %[i32], %[y_ptr] \n\t" | |||
| "stxvd2x %x[x3], %[i48], %[y_ptr] \n\t" | |||
| "addi %[x_ptr], %[x_ptr], 128 \n\t" /* skip the stored block and the block already loaded: one block ahead again */ | |||
| "addi %[y_ptr], %[y_ptr], 128 \n\t" | |||
| "addic. %[temp_n], %[temp_n], -8 \n\t" | |||
| "bgt one%= \n\t" | |||
| "two%=: \n\t" /* tail: rotate and store the last preloaded block, with no further loads */ | |||
| "xvmulsp 40, 32, 36 \n\t" // c * x | |||
| "xvmulsp 41, 33, 36 \n\t" | |||
| "xvmulsp 42, 34, 36 \n\t" | |||
| "xvmulsp 43, 35, 36 \n\t" | |||
| "xvmulsp %x[x0], 48, 36 \n\t" // c * y | |||
| "xvmulsp %x[x2], 49, 36 \n\t" | |||
| "xvmulsp %x[x1], 50, 36 \n\t" | |||
| "xvmulsp %x[x3], 51, 36 \n\t" | |||
| "xvmulsp 44, 32, 37 \n\t" // s * x | |||
| "xvmulsp 45, 33, 37 \n\t" | |||
| "xvmulsp 46, 34, 37 \n\t" | |||
| "xvmulsp 47, 35, 37 \n\t" | |||
| "xvmulsp %x[x4], 48, 37 \n\t" // s * y | |||
| "xvmulsp %x[x5], 49, 37 \n\t" | |||
| "xvmulsp %x[x6], 50, 37 \n\t" | |||
| "xvmulsp %x[x7], 51, 37 \n\t" | |||
| "addi %[x_ptr], %[x_ptr], -64 \n\t" /* step back to the block being stored */ | |||
| "addi %[y_ptr], %[y_ptr], -64 \n\t" | |||
| "xvaddsp 40, 40, %x[x4] \n\t" // c * x + s * y | |||
| "xvaddsp 41, 41, %x[x5] \n\t" // c * x + s * y | |||
| "xvaddsp 42, 42, %x[x6] \n\t" // c * x + s * y | |||
| "xvaddsp 43, 43, %x[x7] \n\t" // c * x + s * y | |||
| "xvsubsp %x[x0], %x[x0], 44 \n\t" // c * y - s * x | |||
| "xvsubsp %x[x2], %x[x2], 45 \n\t" // c * y - s * x | |||
| "xvsubsp %x[x1], %x[x1], 46 \n\t" // c * y - s * x | |||
| "xvsubsp %x[x3], %x[x3], 47 \n\t" // c * y - s * x | |||
| "stxvd2x 40, 0, %[x_ptr] \n\t" // store rotated x | |||
| "stxvd2x 41, %[i16], %[x_ptr] \n\t" | |||
| "stxvd2x 42, %[i32], %[x_ptr] \n\t" | |||
| "stxvd2x 43, %[i48], %[x_ptr] \n\t" | |||
| "stxvd2x %x[x0], 0, %[y_ptr] \n\t" // store rotated y | |||
| "stxvd2x %x[x2], %[i16], %[y_ptr] \n\t" | |||
| "stxvd2x %x[x1], %[i32], %[y_ptr] \n\t" | |||
| "stxvd2x %x[x3], %[i48], %[y_ptr] " | |||
| : /* outputs and read-write operands */ | |||
| [mem_x] "+m" (*(float (*)[2*n])x), /* tells the compiler that all 2*n floats of x are read and written */ | |||
| [mem_y] "+m" (*(float (*)[2*n])y), /* likewise for y */ | |||
| [temp_n] "+r" (n), /* loop counter, decremented by 8 per block */ | |||
| [x_ptr] "+&b" (x), /* early-clobbered address base registers, modified by the addi steps */ | |||
| [y_ptr] "+&b" (y), | |||
| [x0] "=wa" (t0), | |||
| [x1] "=wa" (t2), /* note: x1 is bound to t2 and x2 to t1; both are scratch only, so the crossing has no effect */ | |||
| [x2] "=wa" (t1), | |||
| [x3] "=wa" (t3), | |||
| [x4] "=wa" (t4), | |||
| [x5] "=wa" (t5), | |||
| [x6] "=wa" (t6), | |||
| [x7] "=wa" (t7) | |||
| : /* read-only inputs */ | |||
| [cos] "f" (c), | |||
| [sin] "f" (s), | |||
| [i16] "b" (16), /* byte offsets used by the indexed loads and stores */ | |||
| [i32] "b" (32), | |||
| [i48] "b" (48) | |||
| : /* clobbers */ | |||
| "cr0", /* written by addic. */ | |||
| "vs32","vs33","vs34","vs35","vs36","vs37", /* VSX registers named explicitly in the template */ | |||
| "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", | |||
| "vs48","vs49","vs50","vs51" | |||
| ); | |||
| } | |||
| #endif | |||
| #endif | |||
| /* | |||
| * Rotates n two-FLOAT elements of x and y in place: each FLOAT pair of x | |||
| * becomes c*x + s*y and the matching pair of y becomes c*y - s*x, where the | |||
| * y update still uses the old x values. | |||
| * | |||
| * n            number of elements; n <= 0 is a no-op | |||
| * x, y         arrays updated in place, two FLOATs per element | |||
| * inc_x, inc_y strides counted in elements | |||
| * c, s         rotation coefficients | |||
| * | |||
| * Returns 0 in every case. | |||
| */ | |||
| int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) | |||
| { | |||
| BLASLONG i=0; | |||
| BLASLONG ix=0,iy=0; | |||
| FLOAT temp[2]; | |||
| BLASLONG inc_x2; | |||
| BLASLONG inc_y2; | |||
| if ( n <= 0 ) return(0); | |||
| if ( (inc_x == 1) && (inc_y == 1) ) | |||
| { | |||
| /* crot_kernel_8 is only compiled for POWER8/9/10 with AltiVec enabled, so the | |||
| call must sit behind the same condition; otherwise an AltiVec build for | |||
| another target would reference an undeclared function. */ | |||
| #if (defined(POWER8) || defined(POWER9) || defined(POWER10)) && (defined(__VEC__) || defined(__ALTIVEC__)) | |||
| BLASLONG n1 = n & -8; /* largest multiple of 8 not exceeding n */ | |||
| if ( n1 > 0 ) | |||
| { | |||
| crot_kernel_8(n1, x, y, c, s); | |||
| i=n1; | |||
| ix=2*n1; | |||
| } | |||
| #endif | |||
| /* Scalar loop for the elements the kernel did not cover; with unit strides | |||
| the single offset ix addresses both arrays. */ | |||
| while(i < n) | |||
| { | |||
| temp[0] = c*x[ix] + s*y[ix] ; | |||
| temp[1] = c*x[ix+1] + s*y[ix+1] ; | |||
| y[ix] = c*y[ix] - s*x[ix] ; | |||
| y[ix+1] = c*y[ix+1] - s*x[ix+1] ; | |||
| x[ix] = temp[0] ; | |||
| x[ix+1] = temp[1] ; | |||
| ix += 2 ; | |||
| i++ ; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| /* General strides: each element is two FLOATs, so offsets move by 2*inc. */ | |||
| inc_x2 = 2 * inc_x ; | |||
| inc_y2 = 2 * inc_y ; | |||
| while(i < n) | |||
| { | |||
| temp[0] = c*x[ix] + s*y[iy] ; | |||
| temp[1] = c*x[ix+1] + s*y[iy+1] ; | |||
| y[iy] = c*y[iy] - s*x[ix] ; | |||
| y[iy+1] = c*y[iy+1] - s*x[ix+1] ; | |||
| x[ix] = temp[0] ; | |||
| x[ix+1] = temp[1] ; | |||
| ix += inc_x2 ; | |||
| iy += inc_y2 ; | |||
| i++ ; | |||
| } | |||
| } | |||
| return(0); | |||
| } | |||
| @@ -1,249 +1,249 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2019, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| #include "def_vsx.h" | |||
| #define LOAD ld /* load mnemonic alias; not referenced in the code visible here */ | |||
| #define STACKSIZE (512 ) /* bytes of stack frame the prologue allocates and the epilogue releases */ | |||
| #define ALPHA_SP (296+192)(SP) /* stack slot at offset 488 where the prologue spills f1 (presumably alpha - confirm) */ | |||
| #define FZERO (304+192)(SP) /* stack slot at offset 496 that the prologue fills with a zero word */ | |||
| #define M r3 /* M, N, K: each is tested for <= 0 before the kernel logic runs */ | |||
| #define N r4 | |||
| #define K r5 | |||
| #define A r7 /* A, B, C: not used in the code visible here; presumably the matrix pointers - confirm in the included files */ | |||
| #define B r8 | |||
| #define C r9 | |||
| #define LDC r10 /* shifted left by BASE_SHIFT in the prologue */ | |||
| #define OFFSET r6 /* loaded from the caller's frame when TRMMKERNEL is defined */ | |||
| #define alpha_r vs18 /* vector register loaded from the ALPHA_SP slot with lxvdsx */ | |||
| #define o0 0 | |||
| #define T4 r12 /* T2..T4, C2..C4, L, I, J, AO, BO, CO, TEMP_REG: register aliases used by the included macro/logic files, not by the code visible here */ | |||
| #define T3 r11 | |||
| #define C4 r14 | |||
| #define o8 r15 /* o8, o16, o24, o32, o48: preloaded with the constants 8, 16, 24, 32 and 48 */ | |||
| #define o24 r16 | |||
| #define C2 r17 | |||
| #define L r18 | |||
| #define T1 r19 /* holds the address of the ALPHA_SP slot while alpha_r is loaded */ | |||
| #define C3 r20 | |||
| #define TEMP_REG r21 | |||
| #define I r22 | |||
| #define J r23 | |||
| #define AO r24 | |||
| #define BO r25 | |||
| #define CO r26 | |||
| #define o16 r27 | |||
| #define o32 r28 | |||
| #define o48 r29 | |||
| #define PRE r30 /* preloaded with 384; presumably a prefetch distance in bytes - confirm */ | |||
| #define T2 r31 | |||
| #include "dgemm_macros_power9.S" | |||
| #ifndef NEEDPARAM | |||
| PROLOGUE /* Kernel entry: saves registers, validates M/N/K, loads constants, runs the included dgemm logic, then restores everything and returns 0 in r3. */ | |||
| PROFCODE | |||
| addi SP, SP, -STACKSIZE /* allocate the STACKSIZE-byte frame */ | |||
| li r0, 0 /* r0 = 0, stored to FZERO below */ | |||
| stfd f14, 0(SP) /* save f14-f31 at offsets 0..136 */ | |||
| stfd f15, 8(SP) | |||
| stfd f16, 16(SP) | |||
| stfd f17, 24(SP) | |||
| stfd f18, 32(SP) | |||
| stfd f19, 40(SP) | |||
| stfd f20, 48(SP) | |||
| stfd f21, 56(SP) | |||
| stfd f22, 64(SP) | |||
| stfd f23, 72(SP) | |||
| stfd f24, 80(SP) | |||
| stfd f25, 88(SP) | |||
| stfd f26, 96(SP) | |||
| stfd f27, 104(SP) | |||
| stfd f28, 112(SP) | |||
| stfd f29, 120(SP) | |||
| stfd f30, 128(SP) | |||
| stfd f31, 136(SP) | |||
| std r31, 144(SP) /* save r14-r31 at offsets 144..280 */ | |||
| std r30, 152(SP) | |||
| std r29, 160(SP) | |||
| std r28, 168(SP) | |||
| std r27, 176(SP) | |||
| std r26, 184(SP) | |||
| std r25, 192(SP) | |||
| std r24, 200(SP) | |||
| std r23, 208(SP) | |||
| std r22, 216(SP) | |||
| std r21, 224(SP) | |||
| std r20, 232(SP) | |||
| std r19, 240(SP) | |||
| std r18, 248(SP) | |||
| std r17, 256(SP) | |||
| std r16, 264(SP) | |||
| std r15, 272(SP) | |||
| std r14, 280(SP) | |||
| stxv vs52, 288(SP) /* save vs52-vs63 at offsets 288..464, 16 bytes each */ | |||
| stxv vs53, 304(SP) | |||
| stxv vs54, 320(SP) | |||
| stxv vs55, 336(SP) | |||
| stxv vs56, 352(SP) | |||
| stxv vs57, 368(SP) | |||
| stxv vs58, 384(SP) | |||
| stxv vs59, 400(SP) | |||
| stxv vs60, 416(SP) | |||
| stxv vs61, 432(SP) | |||
| stxv vs62, 448(SP) | |||
| stxv vs63, 464(SP) | |||
| stfd f1, ALPHA_SP /* spill f1 to the ALPHA_SP slot so it can be splat-loaded into alpha_r below */ | |||
| stw r0, FZERO /* write a zero word to the FZERO slot */ | |||
| slwi LDC, LDC, BASE_SHIFT /* LDC <<= BASE_SHIFT (presumably elements to bytes - confirm) */ | |||
| #if defined(TRMMKERNEL) | |||
| ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) /* TRMM builds only: fetch OFFSET from above this frame */ | |||
| #endif | |||
| cmpwi cr0, M, 0 /* leave early when M, N or K is <= 0 */ | |||
| ble .L999_H1 /* .L999_H1 is not defined in the code visible here; presumably supplied by dgemm_logic_power9.S - confirm */ | |||
| cmpwi cr0, N, 0 | |||
| ble .L999_H1 | |||
| cmpwi cr0, K, 0 | |||
| ble .L999_H1 | |||
| addi T1, SP, 296+192 /* T1 = address of the ALPHA_SP slot (offset 488) */ | |||
| li PRE, 384 | |||
| li o8 , 8 /* constant offset registers used by the included code */ | |||
| li o16, 16 | |||
| li o24, 24 | |||
| li o32, 32 | |||
| li o48, 48 | |||
| lxvdsx alpha_r, 0, T1 /* alpha_r = doubleword at ALPHA_SP replicated into both halves */ | |||
| #include "dgemm_logic_power9.S" | |||
| .L999: /* common exit: set the return value and undo the prologue */ | |||
| addi r3, 0, 0 /* r3 = 0 */ | |||
| lfd f14, 0(SP) /* restore f14-f31 */ | |||
| lfd f15, 8(SP) | |||
| lfd f16, 16(SP) | |||
| lfd f17, 24(SP) | |||
| lfd f18, 32(SP) | |||
| lfd f19, 40(SP) | |||
| lfd f20, 48(SP) | |||
| lfd f21, 56(SP) | |||
| lfd f22, 64(SP) | |||
| lfd f23, 72(SP) | |||
| lfd f24, 80(SP) | |||
| lfd f25, 88(SP) | |||
| lfd f26, 96(SP) | |||
| lfd f27, 104(SP) | |||
| lfd f28, 112(SP) | |||
| lfd f29, 120(SP) | |||
| lfd f30, 128(SP) | |||
| lfd f31, 136(SP) | |||
| ld r31, 144(SP) /* restore r14-r31 */ | |||
| ld r30, 152(SP) | |||
| ld r29, 160(SP) | |||
| ld r28, 168(SP) | |||
| ld r27, 176(SP) | |||
| ld r26, 184(SP) | |||
| ld r25, 192(SP) | |||
| ld r24, 200(SP) | |||
| ld r23, 208(SP) | |||
| ld r22, 216(SP) | |||
| ld r21, 224(SP) | |||
| ld r20, 232(SP) | |||
| ld r19, 240(SP) | |||
| ld r18, 248(SP) | |||
| ld r17, 256(SP) | |||
| ld r16, 264(SP) | |||
| ld r15, 272(SP) | |||
| ld r14, 280(SP) | |||
| lxv vs52, 288(SP) /* restore vs52-vs63 */ | |||
| lxv vs53, 304(SP) | |||
| lxv vs54, 320(SP) | |||
| lxv vs55, 336(SP) | |||
| lxv vs56, 352(SP) | |||
| lxv vs57, 368(SP) | |||
| lxv vs58, 384(SP) | |||
| lxv vs59, 400(SP) | |||
| lxv vs60, 416(SP) | |||
| lxv vs61, 432(SP) | |||
| lxv vs62, 448(SP) | |||
| lxv vs63, 464(SP) | |||
| addi SP, SP, STACKSIZE /* release the frame */ | |||
| blr | |||
| EPILOGUE | |||
| #endif | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2019, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| #include "def_vsx.h" | |||
| #define LOAD ld /* load mnemonic alias; not referenced in the code visible here */ | |||
| #define STACKSIZE (512 ) /* bytes of stack frame the prologue allocates and the epilogue releases */ | |||
| #define ALPHA_SP (296+192)(SP) /* stack slot at offset 488 where the prologue spills f1 (presumably alpha - confirm) */ | |||
| #define FZERO (304+192)(SP) /* stack slot at offset 496 that the prologue fills with a zero word */ | |||
| #define M r3 /* M, N, K: each is tested for <= 0 before the kernel logic runs */ | |||
| #define N r4 | |||
| #define K r5 | |||
| #define A r7 /* A, B, C: not used in the code visible here; presumably the matrix pointers - confirm in the included files */ | |||
| #define B r8 | |||
| #define C r9 | |||
| #define LDC r10 /* shifted left by BASE_SHIFT in the prologue */ | |||
| #define OFFSET r6 /* loaded from the caller's frame when TRMMKERNEL is defined */ | |||
| #define alpha_r vs18 /* vector register loaded from the ALPHA_SP slot with lxvdsx */ | |||
| #define o0 0 | |||
| #define T4 r12 /* T2..T4, C2..C4, L, I, J, AO, BO, CO, TEMP_REG: register aliases used by the included macro/logic files, not by the code visible here */ | |||
| #define T3 r11 | |||
| #define C4 r14 | |||
| #define o8 r15 /* o8, o16, o24, o32, o48: preloaded with the constants 8, 16, 24, 32 and 48 */ | |||
| #define o24 r16 | |||
| #define C2 r17 | |||
| #define L r18 | |||
| #define T1 r19 /* holds the address of the ALPHA_SP slot while alpha_r is loaded */ | |||
| #define C3 r20 | |||
| #define TEMP_REG r21 | |||
| #define I r22 | |||
| #define J r23 | |||
| #define AO r24 | |||
| #define BO r25 | |||
| #define CO r26 | |||
| #define o16 r27 | |||
| #define o32 r28 | |||
| #define o48 r29 | |||
| #define PRE r30 /* preloaded with 384; presumably a prefetch distance in bytes - confirm */ | |||
| #define T2 r31 | |||
| #include "dgemm_macros_power9.S" | |||
| #ifndef NEEDPARAM | |||
| PROLOGUE /* Kernel entry: saves registers, validates M/N/K, loads constants, runs the included dgemm logic, then restores everything and returns 0 in r3. */ | |||
| PROFCODE | |||
| addi SP, SP, -STACKSIZE /* allocate the STACKSIZE-byte frame */ | |||
| li r0, 0 /* r0 = 0, stored to FZERO below */ | |||
| stfd f14, 0(SP) /* save f14-f31 at offsets 0..136 */ | |||
| stfd f15, 8(SP) | |||
| stfd f16, 16(SP) | |||
| stfd f17, 24(SP) | |||
| stfd f18, 32(SP) | |||
| stfd f19, 40(SP) | |||
| stfd f20, 48(SP) | |||
| stfd f21, 56(SP) | |||
| stfd f22, 64(SP) | |||
| stfd f23, 72(SP) | |||
| stfd f24, 80(SP) | |||
| stfd f25, 88(SP) | |||
| stfd f26, 96(SP) | |||
| stfd f27, 104(SP) | |||
| stfd f28, 112(SP) | |||
| stfd f29, 120(SP) | |||
| stfd f30, 128(SP) | |||
| stfd f31, 136(SP) | |||
| std r31, 144(SP) /* save r14-r31 at offsets 144..280 */ | |||
| std r30, 152(SP) | |||
| std r29, 160(SP) | |||
| std r28, 168(SP) | |||
| std r27, 176(SP) | |||
| std r26, 184(SP) | |||
| std r25, 192(SP) | |||
| std r24, 200(SP) | |||
| std r23, 208(SP) | |||
| std r22, 216(SP) | |||
| std r21, 224(SP) | |||
| std r20, 232(SP) | |||
| std r19, 240(SP) | |||
| std r18, 248(SP) | |||
| std r17, 256(SP) | |||
| std r16, 264(SP) | |||
| std r15, 272(SP) | |||
| std r14, 280(SP) | |||
| stxv vs52, 288(SP) /* save vs52-vs63 at offsets 288..464, 16 bytes each */ | |||
| stxv vs53, 304(SP) | |||
| stxv vs54, 320(SP) | |||
| stxv vs55, 336(SP) | |||
| stxv vs56, 352(SP) | |||
| stxv vs57, 368(SP) | |||
| stxv vs58, 384(SP) | |||
| stxv vs59, 400(SP) | |||
| stxv vs60, 416(SP) | |||
| stxv vs61, 432(SP) | |||
| stxv vs62, 448(SP) | |||
| stxv vs63, 464(SP) | |||
| stfd f1, ALPHA_SP /* spill f1 to the ALPHA_SP slot so it can be splat-loaded into alpha_r below */ | |||
| stw r0, FZERO /* write a zero word to the FZERO slot */ | |||
| slwi LDC, LDC, BASE_SHIFT /* LDC <<= BASE_SHIFT (presumably elements to bytes - confirm) */ | |||
| #if defined(TRMMKERNEL) | |||
| ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) /* TRMM builds only: fetch OFFSET from above this frame */ | |||
| #endif | |||
| cmpwi cr0, M, 0 /* leave early when M, N or K is <= 0 */ | |||
| ble .L999_H1 /* .L999_H1 is not defined in the code visible here; presumably supplied by dgemm_logic_power9.S - confirm */ | |||
| cmpwi cr0, N, 0 | |||
| ble .L999_H1 | |||
| cmpwi cr0, K, 0 | |||
| ble .L999_H1 | |||
| addi T1, SP, 296+192 /* T1 = address of the ALPHA_SP slot (offset 488) */ | |||
| li PRE, 384 | |||
| li o8 , 8 /* constant offset registers used by the included code */ | |||
| li o16, 16 | |||
| li o24, 24 | |||
| li o32, 32 | |||
| li o48, 48 | |||
| lxvdsx alpha_r, 0, T1 /* alpha_r = doubleword at ALPHA_SP replicated into both halves */ | |||
| #include "dgemm_logic_power9.S" | |||
| .L999: /* common exit: set the return value and undo the prologue */ | |||
| addi r3, 0, 0 /* r3 = 0 */ | |||
| lfd f14, 0(SP) /* restore f14-f31 */ | |||
| lfd f15, 8(SP) | |||
| lfd f16, 16(SP) | |||
| lfd f17, 24(SP) | |||
| lfd f18, 32(SP) | |||
| lfd f19, 40(SP) | |||
| lfd f20, 48(SP) | |||
| lfd f21, 56(SP) | |||
| lfd f22, 64(SP) | |||
| lfd f23, 72(SP) | |||
| lfd f24, 80(SP) | |||
| lfd f25, 88(SP) | |||
| lfd f26, 96(SP) | |||
| lfd f27, 104(SP) | |||
| lfd f28, 112(SP) | |||
| lfd f29, 120(SP) | |||
| lfd f30, 128(SP) | |||
| lfd f31, 136(SP) | |||
| ld r31, 144(SP) /* restore r14-r31 */ | |||
| ld r30, 152(SP) | |||
| ld r29, 160(SP) | |||
| ld r28, 168(SP) | |||
| ld r27, 176(SP) | |||
| ld r26, 184(SP) | |||
| ld r25, 192(SP) | |||
| ld r24, 200(SP) | |||
| ld r23, 208(SP) | |||
| ld r22, 216(SP) | |||
| ld r21, 224(SP) | |||
| ld r20, 232(SP) | |||
| ld r19, 240(SP) | |||
| ld r18, 248(SP) | |||
| ld r17, 256(SP) | |||
| ld r16, 264(SP) | |||
| ld r15, 272(SP) | |||
| ld r14, 280(SP) | |||
| lxv vs52, 288(SP) /* restore vs52-vs63 */ | |||
| lxv vs53, 304(SP) | |||
| lxv vs54, 320(SP) | |||
| lxv vs55, 336(SP) | |||
| lxv vs56, 352(SP) | |||
| lxv vs57, 368(SP) | |||
| lxv vs58, 384(SP) | |||
| lxv vs59, 400(SP) | |||
| lxv vs60, 416(SP) | |||
| lxv vs61, 432(SP) | |||
| lxv vs62, 448(SP) | |||
| lxv vs63, 464(SP) | |||
| addi SP, SP, STACKSIZE /* release the frame */ | |||
| blr | |||
| EPILOGUE | |||
| #endif | |||
| @@ -1,328 +1,328 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2019, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #include <math.h> | |||
| #include <altivec.h> | |||
| /* ABS: scalar absolute value; fabs when DOUBLE is defined, fabsf otherwise. */ | |||
| #if defined(DOUBLE) | |||
| #define ABS fabs | |||
| #else | |||
| #define ABS fabsf | |||
| #endif | |||
| /* CABS1(x,i): ABS(x[i]) + ABS(x[i+1]), the sum of the absolute values of the | |||
| two consecutive entries starting at index i. The arguments and the whole | |||
| replacement list are parenthesized so the macro expands as one operand | |||
| inside any larger expression and accepts expression arguments. */ | |||
| #define CABS1(x,i) (ABS((x)[(i)]) + ABS((x)[(i)+1])) | |||
| #define USE_MASK_PERMUTATIONS 1 //with this type of permutation gcc output a little faster code | |||
| #if !defined(USE_MASK_PERMUTATIONS) | |||
| /* Thin wrapper around the vmrgew instruction (Vector Merge Even Word): feeds a and b | |||
| through "v"-constrained registers and returns the merged vector. Only compiled when | |||
| USE_MASK_PERMUTATIONS is not defined. */ | |||
| static inline __attribute__((always_inline)) __vector float mvec_mergee(__vector float a,__vector float b ) | |||
| { | |||
| __vector float merged; | |||
| __asm__ ( "vmrgew %0,%1,%2;\n" : "=v" (merged) : "v" (a), "v" (b) ); | |||
| return merged; | |||
| } | |||
| /* Thin wrapper around the vmrgow instruction (Vector Merge Odd Word): feeds a and b | |||
| through "v"-constrained registers and returns the merged vector. Only compiled when | |||
| USE_MASK_PERMUTATIONS is not defined. */ | |||
| static inline __attribute__((always_inline)) __vector float mvec_mergeo(__vector float a,__vector float b ) | |||
| { | |||
| __vector float merged; | |||
| __asm__ ( "vmrgow %0,%1,%2;\n" : "=v" (merged) : "v" (a), "v" (b) ); | |||
| return merged; | |||
| } | |||
| #endif | |||
| /** | |||
| * Finds the element of x with the largest |re| + |im|, scanning 32 complex | |||
| * elements (64 floats, 16 vectors) per loop pass. | |||
| * Warning: requires n > 0 and n % 32 == 0, because the loop only consumes | |||
| * whole 32-element chunks. | |||
| * @param n number of complex elements to scan | |||
| * @param x pointer to the vector, stored as interleaved (re, im) pairs | |||
| * @param maxf (out) the largest |re| + |im| found; written only, never read | |||
| * @return 0-based index of that element; ties resolve to the smaller index, | |||
| * and the result stays 0 when no sum is greater than zero | |||
| */ | |||
| static BLASLONG ciamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) { | |||
| BLASLONG index; | |||
| BLASLONG i=0; | |||
| #if defined(USE_MASK_PERMUTATIONS) | |||
| register __vector unsigned int static_index0 = {0,1,2,3}; /* element indices carried by the four lanes of the first summed vector */ | |||
| #else | |||
| register __vector unsigned int static_index0 = {2,0,3,1}; /* different lane order for the merge-based packing (presumably matching how vmrgew/vmrgow interleave lanes - confirm) */ | |||
| #endif | |||
| register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register, first used as the constant 4 | |||
| register __vector unsigned int temp1= temp0<<1; //{8,8,8,8} | |||
| register __vector unsigned int static_index1=static_index0 +temp0; /* lane indices for elements 4..7 */ | |||
| register __vector unsigned int static_index2=static_index0 +temp1; /* lane indices for elements 8..11 */ | |||
| register __vector unsigned int static_index3=static_index1 +temp1; /* lane indices for elements 12..15 */ | |||
| temp0=vec_xor(temp0,temp0); /* temp0 = {0,0,0,0}: from here on the base index of the current 32-element chunk */ | |||
| temp1=temp1 <<1 ; //{16,16,16,16}: offset of the second half of a chunk | |||
| register __vector unsigned int temp_add=temp1 <<1; //{32,32,32,32}: chunk size added to temp0 after each pass | |||
| register __vector unsigned int quadruple_indices=temp0;//{0,0,0,0}: best index found so far in each lane | |||
| register __vector float quadruple_values={0,0,0,0}; /* best |re|+|im| found so far in each lane */ | |||
| register __vector float * v_ptrx=(__vector float *)x; | |||
| #if defined(USE_MASK_PERMUTATIONS) | |||
| register __vector unsigned char real_pack_mask = { 0,1,2,3,8,9,10,11,16,17,18,19, 24,25,26,27}; /* byte selector: words 0 and 2 of each of the two source vectors */ | |||
| register __vector unsigned char image_pack_mask= {4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31}; /* byte selector: words 1 and 3 of each of the two source vectors */ | |||
| #endif | |||
| for(; i<n; i+=32 ){ | |||
| //absolute values of the first 8 vectors (complex elements 0..15 of this chunk) | |||
| register __vector float v0=vec_abs(v_ptrx[0]); | |||
| register __vector float v1=vec_abs(v_ptrx[1]); | |||
| register __vector float v2=vec_abs(v_ptrx[2]); | |||
| register __vector float v3=vec_abs(v_ptrx[3]); | |||
| register __vector float v4=vec_abs(v_ptrx[4]); | |||
| register __vector float v5=vec_abs(v_ptrx[5]); | |||
| register __vector float v6=vec_abs(v_ptrx[6]); | |||
| register __vector float v7=vec_abs(v_ptrx[7]); | |||
| //pack the real parts and the imaginary parts into separate vectors so they can be summed lane by lane | |||
| #if defined(USE_MASK_PERMUTATIONS) | |||
| register __vector float t1=vec_perm(v0,v1,real_pack_mask); | |||
| register __vector float ti=vec_perm(v0,v1,image_pack_mask); | |||
| v0=t1+ti; //four |real| values plus the four matching |imaginary| values | |||
| register __vector float t2=vec_perm(v2,v3,real_pack_mask); | |||
| register __vector float ti2=vec_perm(v2,v3,image_pack_mask); | |||
| v1=t2+ti2; | |||
| t1=vec_perm(v4,v5,real_pack_mask); | |||
| ti=vec_perm(v4,v5,image_pack_mask); | |||
| v2=t1+ti; //sum | |||
| t2=vec_perm(v6,v7,real_pack_mask); | |||
| ti2=vec_perm(v6,v7,image_pack_mask); | |||
| v3=t2+ti2; | |||
| #else | |||
| register __vector float t1=mvec_mergee(v0,v1); | |||
| register __vector float ti=mvec_mergeo(v0,v1); | |||
| v0=t1+ti; //four |real| values plus the four matching |imaginary| values | |||
| register __vector float t2= mvec_mergee(v2,v3); | |||
| register __vector float ti2=mvec_mergeo(v2,v3); | |||
| v1=t2+ti2; | |||
| t1=mvec_mergee(v4,v5); | |||
| ti=mvec_mergeo(v4,v5); | |||
| v2=t1+ti; //sum | |||
| t2=mvec_mergee(v6,v7); | |||
| ti2=mvec_mergeo(v6,v7); | |||
| v3=t2+ti2; | |||
| #endif | |||
| // now we have 16 summed elements {from 0 to 15}; compare them | |||
| v_ptrx+=8; | |||
| register __vector bool int r1=vec_cmpgt(v1,v0); /* strict greater-than, so a tie keeps the lower index */ | |||
| register __vector bool int r2=vec_cmpgt(v3,v2); | |||
| register __vector unsigned int ind2= vec_sel(static_index0,static_index1,r1); | |||
| v0=vec_sel(v0,v1,r1); | |||
| register __vector unsigned int ind3= vec_sel(static_index2,static_index3,r2); | |||
| v1=vec_sel(v2,v3,r2); | |||
| //final compare: select the index and value for the first 16 values | |||
| r1=vec_cmpgt(v1,v0); | |||
| register __vector unsigned int indf0 = vec_sel(ind2,ind3,r1); | |||
| register __vector float vf0= vec_sel(v0,v1,r1); | |||
| //absolute values of the next 8 vectors (complex elements 16..31 of this chunk) | |||
| v0=vec_abs(v_ptrx[0]); | |||
| v1=vec_abs(v_ptrx[1]); | |||
| v2=vec_abs(v_ptrx[2]); | |||
| v3=vec_abs(v_ptrx[3]); | |||
| v4=vec_abs(v_ptrx[4]); | |||
| v5=vec_abs(v_ptrx[5]); | |||
| v6=vec_abs(v_ptrx[6]); | |||
| v7=vec_abs(v_ptrx[7]); | |||
| //pack the real parts and the imaginary parts into separate vectors so they can be summed lane by lane | |||
| #if defined(USE_MASK_PERMUTATIONS) | |||
| t1=vec_perm(v0,v1,real_pack_mask); | |||
| ti=vec_perm(v0,v1,image_pack_mask); | |||
| v0=t1+ti; //four |real| values plus the four matching |imaginary| values | |||
| t2=vec_perm(v2,v3,real_pack_mask); | |||
| ti2=vec_perm(v2,v3,image_pack_mask); | |||
| v1=t2+ti2; | |||
| t1=vec_perm(v4,v5,real_pack_mask); | |||
| ti=vec_perm(v4,v5,image_pack_mask); | |||
| v2=t1+ti; //sum | |||
| t2=vec_perm(v6,v7,real_pack_mask); | |||
| ti2=vec_perm(v6,v7,image_pack_mask); | |||
| v3=t2+ti2; | |||
| #else | |||
| t1=mvec_mergee(v0,v1); | |||
| ti=mvec_mergeo(v0,v1); | |||
| v0=t1+ti; //four |real| values plus the four matching |imaginary| values | |||
| t2=mvec_mergee(v2,v3); | |||
| ti2=mvec_mergeo(v2,v3); | |||
| v1=t2+ti2; | |||
| t1=mvec_mergee(v4,v5); | |||
| ti=mvec_mergeo(v4,v5); | |||
| v2=t1+ti; //sum | |||
| t2=mvec_mergee(v6,v7); | |||
| ti2=mvec_mergeo(v6,v7); | |||
| v3=t2+ti2; | |||
| #endif | |||
| // now we have 16 summed elements {from 16 to 31}; compare them | |||
| v_ptrx+=8; | |||
| r1=vec_cmpgt(v1,v0); | |||
| r2=vec_cmpgt(v3,v2); | |||
| ind2= vec_sel(static_index0,static_index1,r1); | |||
| v0=vec_sel(v0,v1,r1); | |||
| ind3= vec_sel(static_index2,static_index3,r2); | |||
| v1=vec_sel(v2,v3,r2); | |||
| //final compare: select the index and value for the second 16 values | |||
| r1=vec_cmpgt(v1,v0); | |||
| register __vector unsigned int indv0 = vec_sel(ind2,ind3,r1); | |||
| register __vector float vv0= vec_sel(v0,v1,r1); | |||
| indv0+=temp1; //shift the indices into the range 16..31 | |||
| //find the best quadruple among all 32 elements of this chunk | |||
| r2=vec_cmpgt(vv0,vf0); | |||
| ind2 = vec_sel( indf0,indv0,r2); | |||
| vv0= vec_sel(vf0,vv0,r2); | |||
| //get the absolute index by adding the chunk's base index | |||
| ind2+=temp0; | |||
| //compare with the best quadruple so far and update it | |||
| r1=vec_cmpgt(vv0,quadruple_values); | |||
| quadruple_indices = vec_sel( quadruple_indices,ind2,r1); | |||
| quadruple_values= vec_sel(quadruple_values,vv0,r1); | |||
| temp0+=temp_add; /* base index of the next chunk */ | |||
| } | |||
| //now we have to choose among 4 values and their 4 indices | |||
| // compare pairwise: if both values of a pair are exactly equal, take the smaller index | |||
| // otherwise take the index of the larger value | |||
| float a1,a2,a3,a4; | |||
| unsigned int i1,i2,i3,i4; | |||
| a1=vec_extract(quadruple_values,0); | |||
| a2=vec_extract(quadruple_values,1); | |||
| a3=vec_extract(quadruple_values,2); | |||
| a4=vec_extract(quadruple_values,3); | |||
| i1=vec_extract(quadruple_indices,0); | |||
| i2=vec_extract(quadruple_indices,1); | |||
| i3=vec_extract(quadruple_indices,2); | |||
| i4=vec_extract(quadruple_indices,3); | |||
| if(a1==a2){ /* lanes 0 and 1: winner ends up in (a1, index) */ | |||
| index=i1>i2?i2:i1; | |||
| }else if(a2>a1){ | |||
| index=i2; | |||
| a1=a2; | |||
| }else{ | |||
| index= i1; | |||
| } | |||
| if(a4==a3){ /* lanes 2 and 3: winner ends up in (a3, i1); i1 is reused as scratch */ | |||
| i1=i3>i4?i4:i3; | |||
| }else if(a4>a3){ | |||
| i1=i4; | |||
| a3=a4; | |||
| }else{ | |||
| i1= i3; | |||
| } | |||
| if(a1==a3){ /* final round between the two pair winners */ | |||
| index=i1>index?index:i1; | |||
| *maxf=a1; | |||
| }else if(a3>a1){ | |||
| index=i1; | |||
| *maxf=a3; | |||
| }else{ | |||
| *maxf=a1; | |||
| } | |||
| return index; | |||
| } | |||
| BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| { | |||
| BLASLONG i = 0; | |||
| BLASLONG ix = 0; | |||
| FLOAT maxf = 0; | |||
| BLASLONG max = 0; | |||
| BLASLONG inc_x2; | |||
| if (n <= 0 || inc_x <= 0) return(max); | |||
| if (inc_x == 1) { | |||
| BLASLONG n1 = n & -32; | |||
| if (n1 > 0) { | |||
| max = ciamax_kernel_32(n1, x, &maxf); | |||
| i = n1; | |||
| ix = n1 << 1; | |||
| } | |||
| while(i < n) | |||
| { | |||
| if( CABS1(x,ix) > maxf ) | |||
| { | |||
| max = i; | |||
| maxf = CABS1(x,ix); | |||
| } | |||
| ix += 2; | |||
| i++; | |||
| } | |||
| return (max + 1); | |||
| } else { | |||
| inc_x2 = 2 * inc_x; | |||
| maxf = CABS1(x,0); | |||
| ix += inc_x2; | |||
| i++; | |||
| while(i < n) | |||
| { | |||
| if( CABS1(x,ix) > maxf ) | |||
| { | |||
| max = i; | |||
| maxf = CABS1(x,ix); | |||
| } | |||
| ix += inc_x2; | |||
| i++; | |||
| } | |||
| return (max + 1); | |||
| } | |||
| } | |||
| /*************************************************************************** | |||
| Copyright (c) 2019, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #include <math.h> | |||
| #include <altivec.h> | |||
| #if defined(DOUBLE) | |||
| #define ABS fabs | |||
| #else | |||
| #define ABS fabsf | |||
| #endif | |||
| #define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) | |||
| #define USE_MASK_PERMUTATIONS 1 //with this type of permutation gcc output a little faster code | |||
| #if !defined(USE_MASK_PERMUTATIONS) | |||
/*
 * Thin wrapper that issues the "vmrgew" instruction on vector registers
 * a and b and returns the result register. Only compiled when
 * USE_MASK_PERMUTATIONS is not defined.
 */
static inline __attribute__((always_inline)) __vector float mvec_mergee(__vector float a, __vector float b)
{
    __vector float merged;

    __asm__ ("vmrgew %0,%1,%2;\n"
             : "=v" (merged)
             : "v" (a), "v" (b)
             : );
    return merged;
}
/*
 * Thin wrapper that issues the "vmrgow" instruction on vector registers
 * a and b and returns the result register. Only compiled when
 * USE_MASK_PERMUTATIONS is not defined.
 */
static inline __attribute__((always_inline)) __vector float mvec_mergeo(__vector float a, __vector float b)
{
    __vector float merged;

    __asm__ ("vmrgow %0,%1,%2;\n"
             : "=v" (merged)
             : "v" (a), "v" (b)
             : );
    return merged;
}
| #endif | |||
/*
 * |re| + |im| for the four complex numbers stored in the vector pair
 * (lo, hi). The gathering method follows USE_MASK_PERMUTATIONS; the lane
 * order it produces is what the index tables in ciamax_kernel_32 describe.
 */
static inline __attribute__((always_inline)) __vector float
ciamax_cabs1_x4(__vector float lo, __vector float hi)
{
    const __vector float abs_lo = vec_abs(lo);
    const __vector float abs_hi = vec_abs(hi);
#if defined(USE_MASK_PERMUTATIONS)
    /* Byte selectors gathering the real words and the imaginary words. */
    const __vector unsigned char re_sel = {0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27};
    const __vector unsigned char im_sel = {4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31};
    const __vector float re = vec_perm(abs_lo, abs_hi, re_sel);
    const __vector float im = vec_perm(abs_lo, abs_hi, im_sel);
#else
    const __vector float re = mvec_mergee(abs_lo, abs_hi);
    const __vector float im = mvec_mergeo(abs_lo, abs_hi);
#endif

    return re + im;
}

/**
 * Find the index of the element with the largest |re| + |im|.
 * Warning: requires n > 0 and n % 32 == 0 (32 elements are consumed per
 * loop iteration with no tail handling).
 *
 * @param n    number of complex elements to scan
 * @param x    pointer to the vector (interleaved re/im), read through a
 *             __vector float pointer
 * @param maxf (out) largest |re| + |im| found; written only, never read
 * @return 0-based index of that element; equal values resolve to the
 *         smaller index
 *
 * NOTE(review): element indices are tracked in the lanes of
 * __vector unsigned int, so they would wrap once n exceeds the unsigned int
 * range - confirm callers stay below that.
 */
static BLASLONG ciamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf)
{
    /* Block-relative element numbers carried by each lane of the four sums. */
#if defined(USE_MASK_PERMUTATIONS)
    const __vector unsigned int group0 = {0, 1, 2, 3};
#else
    const __vector unsigned int group0 = {2, 0, 3, 1};
#endif
    const __vector unsigned int plus4 = {4, 4, 4, 4};
    const __vector unsigned int plus8 = {8, 8, 8, 8};
    const __vector unsigned int group1 = group0 + plus4;
    const __vector unsigned int group2 = group0 + plus8;
    const __vector unsigned int group3 = group1 + plus8;
    const __vector unsigned int half_offset = {16, 16, 16, 16};
    const __vector unsigned int block_step = {32, 32, 32, 32};

    __vector unsigned int block_base = {0, 0, 0, 0};  /* index of the block's first element */
    __vector unsigned int lane_idx = {0, 0, 0, 0};    /* running per-lane winner index */
    __vector float lane_max = {0, 0, 0, 0};           /* running per-lane maximum */
    __vector float *src = (__vector float *) x;
    BLASLONG done;

    for (done = 0; done < n; done += 32) {
        __vector float s0, s1, s2, s3;
        __vector float max_a, max_b;
        __vector unsigned int idx_a, idx_b;
        __vector bool int pick;
        __vector float first_max, second_max, block_max;
        __vector unsigned int first_idx, second_idx, block_idx;

        /* Elements 0..15 of the block: four vectors of four sums each. */
        s0 = ciamax_cabs1_x4(src[0], src[1]);
        s1 = ciamax_cabs1_x4(src[2], src[3]);
        s2 = ciamax_cabs1_x4(src[4], src[5]);
        s3 = ciamax_cabs1_x4(src[6], src[7]);

        /* Strict '>' keeps the earlier group when values are equal. */
        pick = vec_cmpgt(s1, s0);
        idx_a = vec_sel(group0, group1, pick);
        max_a = vec_sel(s0, s1, pick);
        pick = vec_cmpgt(s3, s2);
        idx_b = vec_sel(group2, group3, pick);
        max_b = vec_sel(s2, s3, pick);
        pick = vec_cmpgt(max_b, max_a);
        first_idx = vec_sel(idx_a, idx_b, pick);
        first_max = vec_sel(max_a, max_b, pick);

        /* Elements 16..31 of the block, same reduction. */
        s0 = ciamax_cabs1_x4(src[8], src[9]);
        s1 = ciamax_cabs1_x4(src[10], src[11]);
        s2 = ciamax_cabs1_x4(src[12], src[13]);
        s3 = ciamax_cabs1_x4(src[14], src[15]);

        pick = vec_cmpgt(s1, s0);
        idx_a = vec_sel(group0, group1, pick);
        max_a = vec_sel(s0, s1, pick);
        pick = vec_cmpgt(s3, s2);
        idx_b = vec_sel(group2, group3, pick);
        max_b = vec_sel(s2, s3, pick);
        pick = vec_cmpgt(max_b, max_a);
        second_idx = vec_sel(idx_a, idx_b, pick) + half_offset;
        second_max = vec_sel(max_a, max_b, pick);

        /* Best of the whole 32-element block, converted to absolute indices. */
        pick = vec_cmpgt(second_max, first_max);
        block_idx = vec_sel(first_idx, second_idx, pick) + block_base;
        block_max = vec_sel(first_max, second_max, pick);

        /* Fold into the running per-lane result. */
        pick = vec_cmpgt(block_max, lane_max);
        lane_idx = vec_sel(lane_idx, block_idx, pick);
        lane_max = vec_sel(lane_max, block_max, pick);

        block_base += block_step;
        src += 16;
    }

    /*
     * Reduce the four lanes pairwise. Equal values take the smaller index,
     * otherwise the index of the larger value is taken.
     */
    {
        const float v0 = vec_extract(lane_max, 0);
        const float v1 = vec_extract(lane_max, 1);
        const float v2 = vec_extract(lane_max, 2);
        const float v3 = vec_extract(lane_max, 3);
        const unsigned int p0 = vec_extract(lane_idx, 0);
        const unsigned int p1 = vec_extract(lane_idx, 1);
        const unsigned int p2 = vec_extract(lane_idx, 2);
        const unsigned int p3 = vec_extract(lane_idx, 3);
        float left_val = v0;
        float right_val = v2;
        unsigned int right_pos;
        BLASLONG found;

        /* lanes 0 and 1 */
        if (v0 == v1) {
            found = (p0 > p1) ? p1 : p0;
        } else if (v1 > v0) {
            found = p1;
            left_val = v1;
        } else {
            found = p0;
        }

        /* lanes 2 and 3 */
        if (v3 == v2) {
            right_pos = (p2 > p3) ? p3 : p2;
        } else if (v3 > v2) {
            right_pos = p3;
            right_val = v3;
        } else {
            right_pos = p2;
        }

        /* the two survivors */
        if (left_val == right_val) {
            found = (right_pos > found) ? found : right_pos;
            *maxf = left_val;
        } else if (right_val > left_val) {
            found = right_pos;
            *maxf = right_val;
        } else {
            *maxf = left_val;
        }
        return found;
    }
}
| BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| { | |||
| BLASLONG i = 0; | |||
| BLASLONG ix = 0; | |||
| FLOAT maxf = 0; | |||
| BLASLONG max = 0; | |||
| BLASLONG inc_x2; | |||
| if (n <= 0 || inc_x <= 0) return(max); | |||
| if (inc_x == 1) { | |||
| BLASLONG n1 = n & -32; | |||
| if (n1 > 0) { | |||
| max = ciamax_kernel_32(n1, x, &maxf); | |||
| i = n1; | |||
| ix = n1 << 1; | |||
| } | |||
| while(i < n) | |||
| { | |||
| if( CABS1(x,ix) > maxf ) | |||
| { | |||
| max = i; | |||
| maxf = CABS1(x,ix); | |||
| } | |||
| ix += 2; | |||
| i++; | |||
| } | |||
| return (max + 1); | |||
| } else { | |||
| inc_x2 = 2 * inc_x; | |||
| maxf = CABS1(x,0); | |||
| ix += inc_x2; | |||
| i++; | |||
| while(i < n) | |||
| { | |||
| if( CABS1(x,ix) > maxf ) | |||
| { | |||
| max = i; | |||
| maxf = CABS1(x,ix); | |||
| } | |||
| ix += inc_x2; | |||
| i++; | |||
| } | |||
| return (max + 1); | |||
| } | |||
| } | |||
| @@ -1,266 +1,266 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2019, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #include <math.h> | |||
| #include <altivec.h> | |||
| #if defined(DOUBLE) | |||
| #define ABS fabs | |||
| #else | |||
| #define ABS fabsf | |||
| #endif | |||
| #define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) | |||
/*
 * |re| + |im| for the four complex numbers stored in the vector pair
 * (lo, hi): real words and imaginary words are gathered with vec_perm and
 * added lane by lane.
 */
static inline __attribute__((always_inline)) __vector float
ciamin_cabs1_x4(__vector float lo, __vector float hi)
{
    /* Byte selectors gathering the real words and the imaginary words. */
    const __vector unsigned char re_sel = {0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27};
    const __vector unsigned char im_sel = {4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31};
    const __vector float abs_lo = vec_abs(lo);
    const __vector float abs_hi = vec_abs(hi);
    const __vector float re = vec_perm(abs_lo, abs_hi, re_sel);
    const __vector float im = vec_perm(abs_lo, abs_hi, im_sel);

    return re + im;
}

/**
 * Find the index of the element with the smallest |re| + |im|.
 * Warning: requires n > 0 and n % 32 == 0 (32 elements are consumed per
 * loop iteration with no tail handling).
 *
 * @param n    number of complex elements to scan
 * @param x    pointer to the vector (interleaved re/im), read through a
 *             __vector float pointer
 * @param minf (out) smallest |re| + |im| found; written only, never read
 * @return 0-based index of that element; equal values resolve to the
 *         smaller index
 *
 * NOTE(review): element indices are tracked in the lanes of
 * __vector unsigned int, so they would wrap once n exceeds the unsigned int
 * range - confirm callers stay below that.
 */
static BLASLONG ciamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf)
{
    /* Block-relative element numbers carried by each lane of the four sums. */
    const __vector unsigned int group0 = {0, 1, 2, 3};
    const __vector unsigned int group1 = {4, 5, 6, 7};
    const __vector unsigned int group2 = {8, 9, 10, 11};
    const __vector unsigned int group3 = {12, 13, 14, 15};
    const __vector unsigned int half_offset = {16, 16, 16, 16};
    const __vector unsigned int block_step = {32, 32, 32, 32};

    /* Every lane starts from element 0 (value and index 0). */
    const float seed = CABS1(x,0);
    __vector unsigned int block_base = {0, 0, 0, 0};  /* index of the block's first element */
    __vector unsigned int lane_idx = {0, 0, 0, 0};    /* running per-lane winner index */
    __vector float lane_min = {seed, seed, seed, seed};
    __vector float *src = (__vector float *) x;
    BLASLONG done;

    for (done = 0; done < n; done += 32) {
        __vector float s0, s1, s2, s3;
        __vector float min_a, min_b;
        __vector unsigned int idx_a, idx_b;
        __vector bool int pick;
        __vector float first_min, second_min, block_min;
        __vector unsigned int first_idx, second_idx, block_idx;

        /* Elements 0..15 of the block: four vectors of four sums each. */
        s0 = ciamin_cabs1_x4(src[0], src[1]);
        s1 = ciamin_cabs1_x4(src[2], src[3]);
        s2 = ciamin_cabs1_x4(src[4], src[5]);
        s3 = ciamin_cabs1_x4(src[6], src[7]);

        /* The later group is taken only when it is strictly smaller. */
        pick = vec_cmpgt(s0, s1);
        idx_a = vec_sel(group0, group1, pick);
        min_a = vec_sel(s0, s1, pick);
        pick = vec_cmpgt(s2, s3);
        idx_b = vec_sel(group2, group3, pick);
        min_b = vec_sel(s2, s3, pick);
        pick = vec_cmpgt(min_a, min_b);
        first_idx = vec_sel(idx_a, idx_b, pick);
        first_min = vec_sel(min_a, min_b, pick);

        /* Elements 16..31 of the block, same reduction. */
        s0 = ciamin_cabs1_x4(src[8], src[9]);
        s1 = ciamin_cabs1_x4(src[10], src[11]);
        s2 = ciamin_cabs1_x4(src[12], src[13]);
        s3 = ciamin_cabs1_x4(src[14], src[15]);

        pick = vec_cmpgt(s0, s1);
        idx_a = vec_sel(group0, group1, pick);
        min_a = vec_sel(s0, s1, pick);
        pick = vec_cmpgt(s2, s3);
        idx_b = vec_sel(group2, group3, pick);
        min_b = vec_sel(s2, s3, pick);
        pick = vec_cmpgt(min_a, min_b);
        second_idx = vec_sel(idx_a, idx_b, pick) + half_offset;
        second_min = vec_sel(min_a, min_b, pick);

        /* Best of the whole 32-element block, converted to absolute indices. */
        pick = vec_cmpgt(first_min, second_min);
        block_idx = vec_sel(first_idx, second_idx, pick) + block_base;
        block_min = vec_sel(first_min, second_min, pick);

        /* Fold into the running per-lane result. */
        pick = vec_cmpgt(lane_min, block_min);
        lane_idx = vec_sel(lane_idx, block_idx, pick);
        lane_min = vec_sel(lane_min, block_min, pick);

        block_base += block_step;
        src += 16;
    }

    /*
     * Reduce the four lanes pairwise. Equal values take the smaller index,
     * otherwise the index of the smaller value is taken.
     */
    {
        const float v0 = vec_extract(lane_min, 0);
        const float v1 = vec_extract(lane_min, 1);
        const float v2 = vec_extract(lane_min, 2);
        const float v3 = vec_extract(lane_min, 3);
        const unsigned int p0 = vec_extract(lane_idx, 0);
        const unsigned int p1 = vec_extract(lane_idx, 1);
        const unsigned int p2 = vec_extract(lane_idx, 2);
        const unsigned int p3 = vec_extract(lane_idx, 3);
        float left_val = v0;
        float right_val = v2;
        unsigned int right_pos;
        BLASLONG found;

        /* lanes 0 and 1 */
        if (v0 == v1) {
            found = (p0 > p1) ? p1 : p0;
        } else if (v1 < v0) {
            found = p1;
            left_val = v1;
        } else {
            found = p0;
        }

        /* lanes 2 and 3 */
        if (v3 == v2) {
            right_pos = (p2 > p3) ? p3 : p2;
        } else if (v3 < v2) {
            right_pos = p3;
            right_val = v3;
        } else {
            right_pos = p2;
        }

        /* the two survivors */
        if (left_val == right_val) {
            found = (right_pos > found) ? found : right_pos;
            *minf = left_val;
        } else if (right_val < left_val) {
            found = right_pos;
            *minf = right_val;
        } else {
            *minf = left_val;
        }
        return found;
    }
}
| BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| { | |||
| BLASLONG i=0; | |||
| BLASLONG ix=0; | |||
| FLOAT minf; | |||
| BLASLONG min=0; | |||
| BLASLONG inc_x2; | |||
| if (n <= 0 || inc_x <= 0) return(min); | |||
| if (inc_x == 1) { | |||
| minf = CABS1(x,0); //index will not be incremented | |||
| BLASLONG n1 = n & -32; | |||
| if (n1 > 0) { | |||
| min = ciamin_kernel_32(n1, x, &minf); | |||
| i = n1; | |||
| ix = n1 << 1; | |||
| } | |||
| while(i < n) | |||
| { | |||
| if( CABS1(x,ix) < minf ) | |||
| { | |||
| min = i; | |||
| minf = CABS1(x,ix); | |||
| } | |||
| ix += 2; | |||
| i++; | |||
| } | |||
| return (min + 1); | |||
| } else { | |||
| inc_x2 = 2 * inc_x; | |||
| minf = CABS1(x,0); | |||
| ix += inc_x2; | |||
| i++; | |||
| while(i < n) | |||
| { | |||
| if( CABS1(x,ix) < minf ) | |||
| { | |||
| min = i; | |||
| minf = CABS1(x,ix); | |||
| } | |||
| ix += inc_x2; | |||
| i++; | |||
| } | |||
| return (min + 1); | |||
| } | |||
| } | |||
| /*************************************************************************** | |||
| Copyright (c) 2019, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #include <math.h> | |||
| #include <altivec.h> | |||
| #if defined(DOUBLE) | |||
| #define ABS fabs | |||
| #else | |||
| #define ABS fabsf | |||
| #endif | |||
| #define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) | |||
/*
 * |re| + |im| for the four complex numbers stored in the vector pair
 * (lo, hi): real words and imaginary words are gathered with vec_perm and
 * added lane by lane.
 */
static inline __attribute__((always_inline)) __vector float
ciamin_cabs1_x4(__vector float lo, __vector float hi)
{
    /* Byte selectors gathering the real words and the imaginary words. */
    const __vector unsigned char re_sel = {0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27};
    const __vector unsigned char im_sel = {4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31};
    const __vector float abs_lo = vec_abs(lo);
    const __vector float abs_hi = vec_abs(hi);
    const __vector float re = vec_perm(abs_lo, abs_hi, re_sel);
    const __vector float im = vec_perm(abs_lo, abs_hi, im_sel);

    return re + im;
}

/**
 * Find the index of the element with the smallest |re| + |im|.
 * Warning: requires n > 0 and n % 32 == 0 (32 elements are consumed per
 * loop iteration with no tail handling).
 *
 * @param n    number of complex elements to scan
 * @param x    pointer to the vector (interleaved re/im), read through a
 *             __vector float pointer
 * @param minf (out) smallest |re| + |im| found; written only, never read
 * @return 0-based index of that element; equal values resolve to the
 *         smaller index
 *
 * NOTE(review): element indices are tracked in the lanes of
 * __vector unsigned int, so they would wrap once n exceeds the unsigned int
 * range - confirm callers stay below that.
 */
static BLASLONG ciamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf)
{
    /* Block-relative element numbers carried by each lane of the four sums. */
    const __vector unsigned int group0 = {0, 1, 2, 3};
    const __vector unsigned int group1 = {4, 5, 6, 7};
    const __vector unsigned int group2 = {8, 9, 10, 11};
    const __vector unsigned int group3 = {12, 13, 14, 15};
    const __vector unsigned int half_offset = {16, 16, 16, 16};
    const __vector unsigned int block_step = {32, 32, 32, 32};

    /* Every lane starts from element 0 (value and index 0). */
    const float seed = CABS1(x,0);
    __vector unsigned int block_base = {0, 0, 0, 0};  /* index of the block's first element */
    __vector unsigned int lane_idx = {0, 0, 0, 0};    /* running per-lane winner index */
    __vector float lane_min = {seed, seed, seed, seed};
    __vector float *src = (__vector float *) x;
    BLASLONG done;

    for (done = 0; done < n; done += 32) {
        __vector float s0, s1, s2, s3;
        __vector float min_a, min_b;
        __vector unsigned int idx_a, idx_b;
        __vector bool int pick;
        __vector float first_min, second_min, block_min;
        __vector unsigned int first_idx, second_idx, block_idx;

        /* Elements 0..15 of the block: four vectors of four sums each. */
        s0 = ciamin_cabs1_x4(src[0], src[1]);
        s1 = ciamin_cabs1_x4(src[2], src[3]);
        s2 = ciamin_cabs1_x4(src[4], src[5]);
        s3 = ciamin_cabs1_x4(src[6], src[7]);

        /* The later group is taken only when it is strictly smaller. */
        pick = vec_cmpgt(s0, s1);
        idx_a = vec_sel(group0, group1, pick);
        min_a = vec_sel(s0, s1, pick);
        pick = vec_cmpgt(s2, s3);
        idx_b = vec_sel(group2, group3, pick);
        min_b = vec_sel(s2, s3, pick);
        pick = vec_cmpgt(min_a, min_b);
        first_idx = vec_sel(idx_a, idx_b, pick);
        first_min = vec_sel(min_a, min_b, pick);

        /* Elements 16..31 of the block, same reduction. */
        s0 = ciamin_cabs1_x4(src[8], src[9]);
        s1 = ciamin_cabs1_x4(src[10], src[11]);
        s2 = ciamin_cabs1_x4(src[12], src[13]);
        s3 = ciamin_cabs1_x4(src[14], src[15]);

        pick = vec_cmpgt(s0, s1);
        idx_a = vec_sel(group0, group1, pick);
        min_a = vec_sel(s0, s1, pick);
        pick = vec_cmpgt(s2, s3);
        idx_b = vec_sel(group2, group3, pick);
        min_b = vec_sel(s2, s3, pick);
        pick = vec_cmpgt(min_a, min_b);
        second_idx = vec_sel(idx_a, idx_b, pick) + half_offset;
        second_min = vec_sel(min_a, min_b, pick);

        /* Best of the whole 32-element block, converted to absolute indices. */
        pick = vec_cmpgt(first_min, second_min);
        block_idx = vec_sel(first_idx, second_idx, pick) + block_base;
        block_min = vec_sel(first_min, second_min, pick);

        /* Fold into the running per-lane result. */
        pick = vec_cmpgt(lane_min, block_min);
        lane_idx = vec_sel(lane_idx, block_idx, pick);
        lane_min = vec_sel(lane_min, block_min, pick);

        block_base += block_step;
        src += 16;
    }

    /*
     * Reduce the four lanes pairwise. Equal values take the smaller index,
     * otherwise the index of the smaller value is taken.
     */
    {
        const float v0 = vec_extract(lane_min, 0);
        const float v1 = vec_extract(lane_min, 1);
        const float v2 = vec_extract(lane_min, 2);
        const float v3 = vec_extract(lane_min, 3);
        const unsigned int p0 = vec_extract(lane_idx, 0);
        const unsigned int p1 = vec_extract(lane_idx, 1);
        const unsigned int p2 = vec_extract(lane_idx, 2);
        const unsigned int p3 = vec_extract(lane_idx, 3);
        float left_val = v0;
        float right_val = v2;
        unsigned int right_pos;
        BLASLONG found;

        /* lanes 0 and 1 */
        if (v0 == v1) {
            found = (p0 > p1) ? p1 : p0;
        } else if (v1 < v0) {
            found = p1;
            left_val = v1;
        } else {
            found = p0;
        }

        /* lanes 2 and 3 */
        if (v3 == v2) {
            right_pos = (p2 > p3) ? p3 : p2;
        } else if (v3 < v2) {
            right_pos = p3;
            right_val = v3;
        } else {
            right_pos = p2;
        }

        /* the two survivors */
        if (left_val == right_val) {
            found = (right_pos > found) ? found : right_pos;
            *minf = left_val;
        } else if (right_val < left_val) {
            found = right_pos;
            *minf = right_val;
        } else {
            *minf = left_val;
        }
        return found;
    }
}
| /* | |||
|  * Driver: returns the 1-based position of the first complex element of x whose | |||
|  * CABS1 value is smallest, or 0 when n or inc_x is not positive. | |||
|  * For unit stride the leading multiple of 32 elements is handed to | |||
|  * ciamin_kernel_32; everything else is scanned by the scalar loop below. | |||
|  */ | |||
| BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| { | |||
|     BLASLONG best = 0;  /* zero-based position of the running minimum */ | |||
|     BLASLONG k = 0;     /* logical element counter */ | |||
|     BLASLONG pos = 0;   /* offset of element k inside the FLOAT array */ | |||
|     BLASLONG step;      /* FLOAT distance between consecutive complex elements */ | |||
|     FLOAT minf; | |||
|     if (n <= 0 || inc_x <= 0) return(best); | |||
|     /* both stride cases start from element 0 as the initial minimum */ | |||
|     minf = CABS1(x,0); | |||
|     if (inc_x == 1) { | |||
|         BLASLONG bulk = n & -32; | |||
|         step = 2; | |||
|         if (bulk > 0) { | |||
|             best = ciamin_kernel_32(bulk, x, &minf); | |||
|             k = bulk; | |||
|             pos = bulk << 1; | |||
|         } | |||
|     } else { | |||
|         step = 2 * inc_x; | |||
|         /* element 0 is already accounted for, resume at element 1 */ | |||
|         k = 1; | |||
|         pos = step; | |||
|     } | |||
|     /* scalar scan; strict less-than keeps the earliest position on ties */ | |||
|     for (; k < n; k++, pos += step) { | |||
|         if( CABS1(x,pos) < minf ) { | |||
|             best = k; | |||
|             minf = CABS1(x,pos); | |||
|         } | |||
|     } | |||
|     return (best + 1); | |||
| } | |||
| @@ -1,288 +1,288 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2019, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #include <math.h> | |||
| #include <altivec.h> | |||
| #if defined(DOUBLE) | |||
| #define ABS fabs | |||
| #else | |||
| #define ABS fabsf | |||
| #endif | |||
| /** | |||
| * Find maximum index | |||
| * Scans a contiguous float vector in blocks of 64 elements and returns the | |||
| * position of the element with the largest absolute value. Each block is | |||
| * reduced to one candidate per vector lane (4 lanes), merged into a running | |||
| * per-lane best, and the 4 lanes are reduced to one result after the loop. | |||
| * Every vector compare is a strict greater-than, so within a lane the earliest | |||
| * position wins on equal values; the final scalar reduction picks the smaller | |||
| * index explicitly when two lanes hold equal values. | |||
| * Warning: requirements n>0 and n % 64 == 0 | |||
| * NOTE(review): the body always works on float vectors and 32-bit unsigned | |||
| * index lanes, whatever FLOAT and BLASLONG are - presumably this is only built | |||
| * for single precision and n fits in 32 bits; confirm. | |||
| * NOTE(review): x is dereferenced through a __vector float pointer - confirm | |||
| * whether callers must guarantee 16-byte alignment. | |||
| * @param n number of elements to scan | |||
| * @param x pointer to the vector | |||
| * @param maxf (out) maximum absolute value .( only for output ) | |||
| * @return zero-based index of the maximum (CNAME below adds 1) | |||
| */ | |||
| static BLASLONG siamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *maxf) { | |||
| BLASLONG index; | |||
| BLASLONG i=0; | |||
| register __vector unsigned int static_index0 = {0,1,2,3}; /* lane offsets of the 1st vector of a 16-element group */ | |||
| register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register, reused below as the base offset of the current 32-element half
| register __vector unsigned int temp1= temp0<<1; //{8,8,8,8} | |||
| register __vector unsigned int static_index1=static_index0 +temp0;//{4,5,6,7}; | |||
| register __vector unsigned int static_index2=static_index0 +temp1;//{8,9,10,11}; | |||
| register __vector unsigned int static_index3=static_index1 +temp1; //{12,13,14,15}; | |||
| temp0=vec_xor(temp0,temp0); /* {0,0,0,0}: base offset of the first half */ | |||
| temp1=temp1 <<1 ; //{16,16,16,16} | |||
| register __vector unsigned int quadruple_indices=temp0;//{0,0,0,0} | |||
| register __vector float quadruple_values={0,0,0,0}; /* running per-lane maximum; absolute values are never below 0 */ | |||
| register __vector float * v_ptrx=(__vector float *)x; | |||
| for(; i<n; i+=64){ | |||
| //absolute values of the first 32 elements of this block (8 vectors of 4 floats) | |||
| register __vector float v0=vec_abs(v_ptrx[0]); | |||
| register __vector float v1=vec_abs(v_ptrx[1]); | |||
| register __vector float v2=vec_abs(v_ptrx[2]); | |||
| register __vector float v3=vec_abs(v_ptrx[3]); | |||
| register __vector float v4=vec_abs(v_ptrx[4]); | |||
| register __vector float v5=vec_abs(v_ptrx[5]); | |||
| register __vector float v6=vec_abs(v_ptrx[6]); | |||
| register __vector float v7=vec_abs(v_ptrx[7]); | |||
| //compare adjacent vectors lane by lane (strict >, so a tie keeps the earlier vector) | |||
| register __vector bool int r1=vec_cmpgt(v1,v0); | |||
| register __vector bool int r2=vec_cmpgt(v3,v2); | |||
| register __vector bool int r3=vec_cmpgt(v5,v4); | |||
| register __vector bool int r4=vec_cmpgt(v7,v6); | |||
| //select the larger value of each pair together with its offset inside its 16-element group | |||
| register __vector unsigned int ind0_first= vec_sel(static_index0,static_index1,r1); | |||
| register __vector float vf0= vec_sel(v0,v1,r1); | |||
| register __vector unsigned int ind1= vec_sel(static_index2,static_index3,r2); | |||
| register __vector float vf1= vec_sel(v2,v3,r2); | |||
| register __vector unsigned int ind2= vec_sel(static_index0,static_index1,r3); | |||
| v0=vec_sel(v4,v5,r3); /* v0 and v1 are reused for the winners of v4..v7 */ | |||
| register __vector unsigned int ind3= vec_sel(static_index2,static_index3,r4); | |||
| v1=vec_sel(v6,v7,r4); | |||
| // compare the pair winners inside each 16-element group | |||
| r1=vec_cmpgt(vf1,vf0); | |||
| r2=vec_cmpgt(v1,v0); | |||
| v_ptrx+=8; /* step over the 32 floats just loaded */ | |||
| //select the winner of each 16-element group | |||
| ind0_first= vec_sel(ind0_first,ind1,r1); | |||
| vf0= vec_sel(vf0,vf1,r1) ; | |||
| ind2= vec_sel(ind2,ind3,r2); | |||
| vf1= vec_sel(v0,v1,r2); | |||
| //second indices actually should be within [16,31] so ind2+16 | |||
| ind2 +=temp1; | |||
| //final cmp and select index and value for the first 32 values | |||
| r1=vec_cmpgt(vf1,vf0); | |||
| ind0_first = vec_sel(ind0_first,ind2,r1); | |||
| vf0= vec_sel(vf0,vf1,r1); | |||
| ind0_first+=temp0; //get absolute index | |||
| temp0+=temp1; | |||
| temp0+=temp1; //temp0+32 | |||
| //second part of 32 | |||
| // absolute values of the second 32 elements of this block | |||
| v0=vec_abs(v_ptrx[0]); | |||
| v1=vec_abs(v_ptrx[1]); | |||
| v2=vec_abs(v_ptrx[2]); | |||
| v3=vec_abs(v_ptrx[3]); | |||
| v4=vec_abs(v_ptrx[4]); | |||
| v5=vec_abs(v_ptrx[5]); | |||
| v6=vec_abs(v_ptrx[6]); | |||
| v7=vec_abs(v_ptrx[7]); | |||
| //compare adjacent vectors lane by lane, same scheme as for the first half | |||
| r1=vec_cmpgt(v1,v0); | |||
| r2=vec_cmpgt(v3,v2); | |||
| r3=vec_cmpgt(v5,v4); | |||
| r4=vec_cmpgt(v7,v6); | |||
| //select the larger value of each pair together with its offset inside its 16-element group | |||
| register __vector unsigned int ind0_second= vec_sel(static_index0,static_index1,r1); | |||
| register __vector float vv0= vec_sel(v0,v1,r1); | |||
| ind1= vec_sel(static_index2,static_index3,r2); | |||
| register __vector float vv1= vec_sel(v2,v3,r2); | |||
| ind2= vec_sel(static_index0,static_index1,r3); | |||
| v0=vec_sel(v4,v5,r3); | |||
| ind3= vec_sel(static_index2,static_index3,r4); | |||
| v1=vec_sel(v6,v7,r4); | |||
| // compare the pair winners inside each 16-element group | |||
| r1=vec_cmpgt(vv1,vv0); | |||
| r2=vec_cmpgt(v1,v0); | |||
| v_ptrx+=8; /* step over the 32 floats just loaded */ | |||
| //select the winner of each 16-element group | |||
| ind0_second= vec_sel(ind0_second,ind1,r1); | |||
| vv0= vec_sel(vv0,vv1,r1) ; | |||
| ind2= vec_sel(ind2,ind3,r2); | |||
| vv1= vec_sel(v0,v1,r2) ; | |||
| //second indices actually should be within [16,31] so ind2+16 | |||
| ind2 +=temp1; | |||
| //final cmp and select index and value for the second 32 values | |||
| r1=vec_cmpgt(vv1,vv0); | |||
| ind0_second = vec_sel(ind0_second,ind2,r1); | |||
| vv0= vec_sel(vv0,vv1,r1); | |||
| ind0_second+=temp0; //get absolute index | |||
| //find final quadruple from 64 elements | |||
| r2=vec_cmpgt(vv0,vf0); | |||
| ind2 = vec_sel( ind0_first,ind0_second,r2); | |||
| vv0= vec_sel(vf0,vv0,r2); | |||
| //compare with old quadruple and update | |||
| r3=vec_cmpgt(vv0,quadruple_values); | |||
| quadruple_indices = vec_sel( quadruple_indices,ind2,r3); | |||
| quadruple_values= vec_sel(quadruple_values,vv0,r3); | |||
| temp0+=temp1; | |||
| temp0+=temp1; //temp0+32 | |||
| } | |||
| //now we have to choose from 4 values and 4 different indices | |||
| // we will compare pairwise; if a pair is exactly equal we will choose the smaller index | |||
| // otherwise we will assign index of the maximum value | |||
| float a1,a2,a3,a4; | |||
| unsigned int i1,i2,i3,i4; | |||
| a1=vec_extract(quadruple_values,0); | |||
| a2=vec_extract(quadruple_values,1); | |||
| a3=vec_extract(quadruple_values,2); | |||
| a4=vec_extract(quadruple_values,3); | |||
| i1=vec_extract(quadruple_indices,0); | |||
| i2=vec_extract(quadruple_indices,1); | |||
| i3=vec_extract(quadruple_indices,2); | |||
| i4=vec_extract(quadruple_indices,3); | |||
| if(a1==a2){ | |||
| index=i1>i2?i2:i1; /* tie between lanes 0 and 1: smaller index */ | |||
| }else if(a2>a1){ | |||
| index=i2; | |||
| a1=a2; /* a1 now holds the winner of lanes 0 and 1 */ | |||
| }else{ | |||
| index= i1; | |||
| } | |||
| if(a4==a3){ | |||
| i1=i3>i4?i4:i3; /* i1 is reused for the winning index of lanes 2 and 3 */ | |||
| }else if(a4>a3){ | |||
| i1=i4; | |||
| a3=a4; /* a3 now holds the winner of lanes 2 and 3 */ | |||
| }else{ | |||
| i1= i3; | |||
| } | |||
| if(a1==a3){ | |||
| index=i1>index?index:i1; /* tie between the two pair winners: smaller index */ | |||
| *maxf=a1; | |||
| }else if(a3>a1){ | |||
| index=i1; | |||
| *maxf=a3; | |||
| }else{ | |||
| *maxf=a1; | |||
| } | |||
| return index; | |||
| } | |||
| /* | |||
|  * Driver: returns the 1-based position of the first element of x with the | |||
|  * largest absolute value, or 0 when n or inc_x is not positive. | |||
|  * For unit stride the leading multiple of 64 elements is handed to | |||
|  * siamax_kernel_64 and the remainder is scanned one element at a time; for | |||
|  * other strides the scan is done in groups of 4 followed by a scalar tail. | |||
|  */ | |||
| BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||
|     BLASLONG best = 0;  /* zero-based position of the running maximum */ | |||
|     FLOAT maxf = 0.0;   /* running maximum absolute value */ | |||
|     BLASLONG k = 0;     /* logical element counter */ | |||
|     if (n <= 0 || inc_x <= 0) return (best); | |||
|     if (inc_x == 1) { | |||
|         BLASLONG bulk = n & -64; | |||
|         if (bulk > 0) { | |||
|             best = siamax_kernel_64(bulk, x, &maxf); | |||
|             k = bulk; | |||
|         } | |||
|         /* leftover elements after the vectorised part */ | |||
|         for (; k < n; k++) { | |||
|             if (ABS(x[k]) > maxf) { | |||
|                 best = k; | |||
|                 maxf = ABS(x[k]); | |||
|             } | |||
|         } | |||
|     } else { | |||
|         BLASLONG blocked = n & -4; | |||
|         BLASLONG off = 0;   /* array offset of logical element k */ | |||
|         while (k < blocked) { | |||
|             BLASLONG u; | |||
|             /* four consecutive strided elements, visited in order */ | |||
|             for (u = 0; u < 4; u++) { | |||
|                 if (ABS(x[off + u * inc_x]) > maxf) { | |||
|                     best = k + u; | |||
|                     maxf = ABS(x[off + u * inc_x]); | |||
|                 } | |||
|             } | |||
|             off += inc_x * 4; | |||
|             k += 4; | |||
|         } | |||
|         for (; k < n; k++, off += inc_x) { | |||
|             if (ABS(x[off]) > maxf) { | |||
|                 best = k; | |||
|                 maxf = ABS(x[off]); | |||
|             } | |||
|         } | |||
|     } | |||
|     return (best + 1); | |||
| } | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2019, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #include <math.h> | |||
| #include <altivec.h> | |||
| #if defined(DOUBLE) | |||
| #define ABS fabs | |||
| #else | |||
| #define ABS fabsf | |||
| #endif | |||
| /** | |||
| * Find maximum index | |||
| * Scans a contiguous float vector in blocks of 64 elements and returns the | |||
| * position of the element with the largest absolute value. Each block is | |||
| * reduced to one candidate per vector lane (4 lanes), merged into a running | |||
| * per-lane best, and the 4 lanes are reduced to one result after the loop. | |||
| * Every vector compare is a strict greater-than, so within a lane the earliest | |||
| * position wins on equal values; the final scalar reduction picks the smaller | |||
| * index explicitly when two lanes hold equal values. | |||
| * Warning: requirements n>0 and n % 64 == 0 | |||
| * NOTE(review): the body always works on float vectors and 32-bit unsigned | |||
| * index lanes, whatever FLOAT and BLASLONG are - presumably this is only built | |||
| * for single precision and n fits in 32 bits; confirm. | |||
| * NOTE(review): x is dereferenced through a __vector float pointer - confirm | |||
| * whether callers must guarantee 16-byte alignment. | |||
| * @param n number of elements to scan | |||
| * @param x pointer to the vector | |||
| * @param maxf (out) maximum absolute value .( only for output ) | |||
| * @return zero-based index of the maximum (CNAME below adds 1) | |||
| */ | |||
| static BLASLONG siamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *maxf) { | |||
| BLASLONG index; | |||
| BLASLONG i=0; | |||
| register __vector unsigned int static_index0 = {0,1,2,3}; /* lane offsets of the 1st vector of a 16-element group */ | |||
| register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register, reused below as the base offset of the current 32-element half
| register __vector unsigned int temp1= temp0<<1; //{8,8,8,8} | |||
| register __vector unsigned int static_index1=static_index0 +temp0;//{4,5,6,7}; | |||
| register __vector unsigned int static_index2=static_index0 +temp1;//{8,9,10,11}; | |||
| register __vector unsigned int static_index3=static_index1 +temp1; //{12,13,14,15}; | |||
| temp0=vec_xor(temp0,temp0); /* {0,0,0,0}: base offset of the first half */ | |||
| temp1=temp1 <<1 ; //{16,16,16,16} | |||
| register __vector unsigned int quadruple_indices=temp0;//{0,0,0,0} | |||
| register __vector float quadruple_values={0,0,0,0}; /* running per-lane maximum; absolute values are never below 0 */ | |||
| register __vector float * v_ptrx=(__vector float *)x; | |||
| for(; i<n; i+=64){ | |||
| //absolute values of the first 32 elements of this block (8 vectors of 4 floats) | |||
| register __vector float v0=vec_abs(v_ptrx[0]); | |||
| register __vector float v1=vec_abs(v_ptrx[1]); | |||
| register __vector float v2=vec_abs(v_ptrx[2]); | |||
| register __vector float v3=vec_abs(v_ptrx[3]); | |||
| register __vector float v4=vec_abs(v_ptrx[4]); | |||
| register __vector float v5=vec_abs(v_ptrx[5]); | |||
| register __vector float v6=vec_abs(v_ptrx[6]); | |||
| register __vector float v7=vec_abs(v_ptrx[7]); | |||
| //compare adjacent vectors lane by lane (strict >, so a tie keeps the earlier vector) | |||
| register __vector bool int r1=vec_cmpgt(v1,v0); | |||
| register __vector bool int r2=vec_cmpgt(v3,v2); | |||
| register __vector bool int r3=vec_cmpgt(v5,v4); | |||
| register __vector bool int r4=vec_cmpgt(v7,v6); | |||
| //select the larger value of each pair together with its offset inside its 16-element group | |||
| register __vector unsigned int ind0_first= vec_sel(static_index0,static_index1,r1); | |||
| register __vector float vf0= vec_sel(v0,v1,r1); | |||
| register __vector unsigned int ind1= vec_sel(static_index2,static_index3,r2); | |||
| register __vector float vf1= vec_sel(v2,v3,r2); | |||
| register __vector unsigned int ind2= vec_sel(static_index0,static_index1,r3); | |||
| v0=vec_sel(v4,v5,r3); /* v0 and v1 are reused for the winners of v4..v7 */ | |||
| register __vector unsigned int ind3= vec_sel(static_index2,static_index3,r4); | |||
| v1=vec_sel(v6,v7,r4); | |||
| // compare the pair winners inside each 16-element group | |||
| r1=vec_cmpgt(vf1,vf0); | |||
| r2=vec_cmpgt(v1,v0); | |||
| v_ptrx+=8; /* step over the 32 floats just loaded */ | |||
| //select the winner of each 16-element group | |||
| ind0_first= vec_sel(ind0_first,ind1,r1); | |||
| vf0= vec_sel(vf0,vf1,r1) ; | |||
| ind2= vec_sel(ind2,ind3,r2); | |||
| vf1= vec_sel(v0,v1,r2); | |||
| //second indices actually should be within [16,31] so ind2+16 | |||
| ind2 +=temp1; | |||
| //final cmp and select index and value for the first 32 values | |||
| r1=vec_cmpgt(vf1,vf0); | |||
| ind0_first = vec_sel(ind0_first,ind2,r1); | |||
| vf0= vec_sel(vf0,vf1,r1); | |||
| ind0_first+=temp0; //get absolute index | |||
| temp0+=temp1; | |||
| temp0+=temp1; //temp0+32 | |||
| //second part of 32 | |||
| // absolute values of the second 32 elements of this block | |||
| v0=vec_abs(v_ptrx[0]); | |||
| v1=vec_abs(v_ptrx[1]); | |||
| v2=vec_abs(v_ptrx[2]); | |||
| v3=vec_abs(v_ptrx[3]); | |||
| v4=vec_abs(v_ptrx[4]); | |||
| v5=vec_abs(v_ptrx[5]); | |||
| v6=vec_abs(v_ptrx[6]); | |||
| v7=vec_abs(v_ptrx[7]); | |||
| //compare adjacent vectors lane by lane, same scheme as for the first half | |||
| r1=vec_cmpgt(v1,v0); | |||
| r2=vec_cmpgt(v3,v2); | |||
| r3=vec_cmpgt(v5,v4); | |||
| r4=vec_cmpgt(v7,v6); | |||
| //select the larger value of each pair together with its offset inside its 16-element group | |||
| register __vector unsigned int ind0_second= vec_sel(static_index0,static_index1,r1); | |||
| register __vector float vv0= vec_sel(v0,v1,r1); | |||
| ind1= vec_sel(static_index2,static_index3,r2); | |||
| register __vector float vv1= vec_sel(v2,v3,r2); | |||
| ind2= vec_sel(static_index0,static_index1,r3); | |||
| v0=vec_sel(v4,v5,r3); | |||
| ind3= vec_sel(static_index2,static_index3,r4); | |||
| v1=vec_sel(v6,v7,r4); | |||
| // compare the pair winners inside each 16-element group | |||
| r1=vec_cmpgt(vv1,vv0); | |||
| r2=vec_cmpgt(v1,v0); | |||
| v_ptrx+=8; /* step over the 32 floats just loaded */ | |||
| //select the winner of each 16-element group | |||
| ind0_second= vec_sel(ind0_second,ind1,r1); | |||
| vv0= vec_sel(vv0,vv1,r1) ; | |||
| ind2= vec_sel(ind2,ind3,r2); | |||
| vv1= vec_sel(v0,v1,r2) ; | |||
| //second indices actually should be within [16,31] so ind2+16 | |||
| ind2 +=temp1; | |||
| //final cmp and select index and value for the second 32 values | |||
| r1=vec_cmpgt(vv1,vv0); | |||
| ind0_second = vec_sel(ind0_second,ind2,r1); | |||
| vv0= vec_sel(vv0,vv1,r1); | |||
| ind0_second+=temp0; //get absolute index | |||
| //find final quadruple from 64 elements | |||
| r2=vec_cmpgt(vv0,vf0); | |||
| ind2 = vec_sel( ind0_first,ind0_second,r2); | |||
| vv0= vec_sel(vf0,vv0,r2); | |||
| //compare with old quadruple and update | |||
| r3=vec_cmpgt(vv0,quadruple_values); | |||
| quadruple_indices = vec_sel( quadruple_indices,ind2,r3); | |||
| quadruple_values= vec_sel(quadruple_values,vv0,r3); | |||
| temp0+=temp1; | |||
| temp0+=temp1; //temp0+32 | |||
| } | |||
| //now we have to choose from 4 values and 4 different indices | |||
| // we will compare pairwise; if a pair is exactly equal we will choose the smaller index | |||
| // otherwise we will assign index of the maximum value | |||
| float a1,a2,a3,a4; | |||
| unsigned int i1,i2,i3,i4; | |||
| a1=vec_extract(quadruple_values,0); | |||
| a2=vec_extract(quadruple_values,1); | |||
| a3=vec_extract(quadruple_values,2); | |||
| a4=vec_extract(quadruple_values,3); | |||
| i1=vec_extract(quadruple_indices,0); | |||
| i2=vec_extract(quadruple_indices,1); | |||
| i3=vec_extract(quadruple_indices,2); | |||
| i4=vec_extract(quadruple_indices,3); | |||
| if(a1==a2){ | |||
| index=i1>i2?i2:i1; /* tie between lanes 0 and 1: smaller index */ | |||
| }else if(a2>a1){ | |||
| index=i2; | |||
| a1=a2; /* a1 now holds the winner of lanes 0 and 1 */ | |||
| }else{ | |||
| index= i1; | |||
| } | |||
| if(a4==a3){ | |||
| i1=i3>i4?i4:i3; /* i1 is reused for the winning index of lanes 2 and 3 */ | |||
| }else if(a4>a3){ | |||
| i1=i4; | |||
| a3=a4; /* a3 now holds the winner of lanes 2 and 3 */ | |||
| }else{ | |||
| i1= i3; | |||
| } | |||
| if(a1==a3){ | |||
| index=i1>index?index:i1; /* tie between the two pair winners: smaller index */ | |||
| *maxf=a1; | |||
| }else if(a3>a1){ | |||
| index=i1; | |||
| *maxf=a3; | |||
| }else{ | |||
| *maxf=a1; | |||
| } | |||
| return index; | |||
| } | |||
| /* | |||
|  * Driver: returns the 1-based position of the first element of x with the | |||
|  * largest absolute value, or 0 when n or inc_x is not positive. | |||
|  * For unit stride the leading multiple of 64 elements is handed to | |||
|  * siamax_kernel_64 and the remainder is scanned one element at a time; for | |||
|  * other strides the scan is done in groups of 4 followed by a scalar tail. | |||
|  */ | |||
| BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||
|     BLASLONG best = 0;  /* zero-based position of the running maximum */ | |||
|     FLOAT maxf = 0.0;   /* running maximum absolute value */ | |||
|     BLASLONG k = 0;     /* logical element counter */ | |||
|     if (n <= 0 || inc_x <= 0) return (best); | |||
|     if (inc_x == 1) { | |||
|         BLASLONG bulk = n & -64; | |||
|         if (bulk > 0) { | |||
|             best = siamax_kernel_64(bulk, x, &maxf); | |||
|             k = bulk; | |||
|         } | |||
|         /* leftover elements after the vectorised part */ | |||
|         for (; k < n; k++) { | |||
|             if (ABS(x[k]) > maxf) { | |||
|                 best = k; | |||
|                 maxf = ABS(x[k]); | |||
|             } | |||
|         } | |||
|     } else { | |||
|         BLASLONG blocked = n & -4; | |||
|         BLASLONG off = 0;   /* array offset of logical element k */ | |||
|         while (k < blocked) { | |||
|             BLASLONG u; | |||
|             /* four consecutive strided elements, visited in order */ | |||
|             for (u = 0; u < 4; u++) { | |||
|                 if (ABS(x[off + u * inc_x]) > maxf) { | |||
|                     best = k + u; | |||
|                     maxf = ABS(x[off + u * inc_x]); | |||
|                 } | |||
|             } | |||
|             off += inc_x * 4; | |||
|             k += 4; | |||
|         } | |||
|         for (; k < n; k++, off += inc_x) { | |||
|             if (ABS(x[off]) > maxf) { | |||
|                 best = k; | |||
|                 maxf = ABS(x[off]); | |||
|             } | |||
|         } | |||
|     } | |||
|     return (best + 1); | |||
| } | |||
| @@ -1,288 +1,288 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2019, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #include <math.h> | |||
| #include <altivec.h> | |||
| #if defined(DOUBLE) | |||
| #define ABS fabs | |||
| #else | |||
| #define ABS fabsf | |||
| #endif | |||
| /** | |||
| * Find minimum index | |||
| * Scans a contiguous float vector in blocks of 64 elements and returns the | |||
| * position of the element with the smallest absolute value. Each block is | |||
| * reduced to one candidate per vector lane (4 lanes), merged into a running | |||
| * per-lane best, and the 4 lanes are reduced to one result after the loop. | |||
| * A candidate only replaces an earlier one when the earlier value is strictly | |||
| * greater, so within a lane the earliest position wins on equal values; the | |||
| * final scalar reduction picks the smaller index explicitly when two lanes | |||
| * hold equal values. | |||
| * Warning: requirements n>0 and n % 64 == 0 | |||
| * NOTE(review): the body always works on float vectors and 32-bit unsigned | |||
| * index lanes, whatever FLOAT and BLASLONG are - presumably this is only built | |||
| * for single precision and n fits in 32 bits; confirm. | |||
| * NOTE(review): x is dereferenced through a __vector float pointer - confirm | |||
| * whether callers must guarantee 16-byte alignment. | |||
| * @param n number of elements to scan | |||
| * @param x pointer to the vector | |||
| * @param minf (out) minimum absolute value .( only for output ) | |||
| * @return zero-based index of the minimum (CNAME below adds 1) | |||
| */ | |||
| static BLASLONG siamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *minf) { | |||
| BLASLONG index; | |||
| BLASLONG i=0; | |||
| register __vector unsigned int static_index0 = {0,1,2,3}; /* lane offsets of the 1st vector of a 16-element group */ | |||
| register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register, reused below as the base offset of the current 32-element half
| register __vector unsigned int temp1= temp0<<1; //{8,8,8,8} | |||
| register __vector unsigned int static_index1=static_index0 +temp0;//{4,5,6,7}; | |||
| register __vector unsigned int static_index2=static_index0 +temp1;//{8,9,10,11}; | |||
| register __vector unsigned int static_index3=static_index1 +temp1; //{12,13,14,15}; | |||
| temp0=vec_xor(temp0,temp0); /* {0,0,0,0}: base offset of the first half */ | |||
| temp1=temp1 <<1 ; //{16,16,16,16} | |||
| register __vector unsigned int quadruple_indices=static_index0;//{0,1,2,3}; | |||
| register __vector float * v_ptrx=(__vector float *)x; | |||
| register __vector float quadruple_values=vec_abs(v_ptrx[0]); /* seed the running per-lane minimum with elements 0..3 */ | |||
| for(; i<n; i+=64){ | |||
| //absolute values of the first 32 elements of this block (8 vectors of 4 floats) | |||
| register __vector float v0=vec_abs(v_ptrx[0]); | |||
| register __vector float v1=vec_abs(v_ptrx[1]); | |||
| register __vector float v2=vec_abs(v_ptrx[2]); | |||
| register __vector float v3=vec_abs(v_ptrx[3]); | |||
| register __vector float v4=vec_abs(v_ptrx[4]); | |||
| register __vector float v5=vec_abs(v_ptrx[5]); | |||
| register __vector float v6=vec_abs(v_ptrx[6]); | |||
| register __vector float v7=vec_abs(v_ptrx[7]); | |||
| //compare adjacent vectors lane by lane (mask set where the earlier value is strictly greater, so a tie keeps the earlier vector) | |||
| register __vector bool int r1=vec_cmpgt(v0,v1); | |||
| register __vector bool int r2=vec_cmpgt(v2,v3); | |||
| register __vector bool int r3=vec_cmpgt(v4,v5); | |||
| register __vector bool int r4=vec_cmpgt(v6,v7); | |||
| //select the smaller value of each pair together with its offset inside its 16-element group | |||
| register __vector unsigned int ind0_first= vec_sel(static_index0,static_index1,r1); | |||
| register __vector float vf0= vec_sel(v0,v1,r1); | |||
| register __vector unsigned int ind1= vec_sel(static_index2,static_index3,r2); | |||
| register __vector float vf1= vec_sel(v2,v3,r2); | |||
| register __vector unsigned int ind2= vec_sel(static_index0,static_index1,r3); | |||
| v0=vec_sel(v4,v5,r3); /* v0 and v1 are reused for the winners of v4..v7 */ | |||
| register __vector unsigned int ind3= vec_sel(static_index2,static_index3,r4); | |||
| v1=vec_sel(v6,v7,r4); | |||
| // compare the pair winners inside each 16-element group | |||
| r1=vec_cmpgt(vf0,vf1); | |||
| r2=vec_cmpgt(v0,v1); | |||
| v_ptrx+=8; /* step over the 32 floats just loaded */ | |||
| //select the winner of each 16-element group | |||
| ind0_first= vec_sel(ind0_first,ind1,r1); | |||
| vf0= vec_sel(vf0,vf1,r1) ; | |||
| ind2= vec_sel(ind2,ind3,r2); | |||
| vf1= vec_sel(v0,v1,r2); | |||
| //second indices actually should be within [16,31] so ind2+16 | |||
| ind2 +=temp1; | |||
| //final cmp and select index and value for the first 32 values | |||
| r1=vec_cmpgt(vf0,vf1); | |||
| ind0_first = vec_sel(ind0_first,ind2,r1); | |||
| vf0= vec_sel(vf0,vf1,r1); | |||
| ind0_first+=temp0; //get absolute index | |||
| temp0+=temp1; | |||
| temp0+=temp1; //temp0+32 | |||
| //second part of 32 | |||
| // absolute values of the second 32 elements of this block | |||
| v0=vec_abs(v_ptrx[0]); | |||
| v1=vec_abs(v_ptrx[1]); | |||
| v2=vec_abs(v_ptrx[2]); | |||
| v3=vec_abs(v_ptrx[3]); | |||
| v4=vec_abs(v_ptrx[4]); | |||
| v5=vec_abs(v_ptrx[5]); | |||
| v6=vec_abs(v_ptrx[6]); | |||
| v7=vec_abs(v_ptrx[7]); | |||
| //compare adjacent vectors lane by lane, same scheme as for the first half | |||
| r1=vec_cmpgt(v0,v1); | |||
| r2=vec_cmpgt(v2,v3); | |||
| r3=vec_cmpgt(v4,v5); | |||
| r4=vec_cmpgt(v6,v7); | |||
| //select the smaller value of each pair together with its offset inside its 16-element group | |||
| register __vector unsigned int ind0_second= vec_sel(static_index0,static_index1,r1); | |||
| register __vector float vv0= vec_sel(v0,v1,r1); | |||
| ind1= vec_sel(static_index2,static_index3,r2); | |||
| register __vector float vv1= vec_sel(v2,v3,r2); | |||
| ind2= vec_sel(static_index0,static_index1,r3); | |||
| v0=vec_sel(v4,v5,r3); | |||
| ind3= vec_sel(static_index2,static_index3,r4); | |||
| v1=vec_sel(v6,v7,r4); | |||
| // compare the pair winners inside each 16-element group | |||
| r1=vec_cmpgt(vv0,vv1); | |||
| r2=vec_cmpgt(v0,v1); | |||
| v_ptrx+=8; /* step over the 32 floats just loaded */ | |||
| //select the winner of each 16-element group | |||
| ind0_second= vec_sel(ind0_second,ind1,r1); | |||
| vv0= vec_sel(vv0,vv1,r1) ; | |||
| ind2= vec_sel(ind2,ind3,r2); | |||
| vv1= vec_sel(v0,v1,r2) ; | |||
| //second indices actually should be within [16,31] so ind2+16 | |||
| ind2 +=temp1; | |||
| //final cmp and select index and value for the second 32 values | |||
| r1=vec_cmpgt(vv0,vv1); | |||
| ind0_second = vec_sel(ind0_second,ind2,r1); | |||
| vv0= vec_sel(vv0,vv1,r1); | |||
| ind0_second+=temp0; //get absolute index | |||
| //find final quadruple from 64 elements | |||
| r2=vec_cmpgt(vf0,vv0); | |||
| ind2 = vec_sel( ind0_first,ind0_second,r2); | |||
| vv0= vec_sel(vf0,vv0,r2); | |||
| //compare with old quadruple and update | |||
| r3=vec_cmpgt( quadruple_values,vv0); | |||
| quadruple_indices = vec_sel( quadruple_indices,ind2,r3); | |||
| quadruple_values= vec_sel(quadruple_values,vv0,r3); | |||
| temp0+=temp1; | |||
| temp0+=temp1; //temp0+32 | |||
| } | |||
| //now we have to choose from 4 values and 4 different indices | |||
| // we will compare pairwise; if a pair is exactly equal we will choose the smaller index | |||
| // otherwise we will assign index of the minimum value | |||
| float a1,a2,a3,a4; | |||
| unsigned int i1,i2,i3,i4; | |||
| a1=vec_extract(quadruple_values,0); | |||
| a2=vec_extract(quadruple_values,1); | |||
| a3=vec_extract(quadruple_values,2); | |||
| a4=vec_extract(quadruple_values,3); | |||
| i1=vec_extract(quadruple_indices,0); | |||
| i2=vec_extract(quadruple_indices,1); | |||
| i3=vec_extract(quadruple_indices,2); | |||
| i4=vec_extract(quadruple_indices,3); | |||
| if(a1==a2){ | |||
| index=i1>i2?i2:i1; /* tie between lanes 0 and 1: smaller index */ | |||
| }else if(a2<a1){ | |||
| index=i2; | |||
| a1=a2; /* a1 now holds the winner of lanes 0 and 1 */ | |||
| }else{ | |||
| index= i1; | |||
| } | |||
| if(a4==a3){ | |||
| i1=i3>i4?i4:i3; /* i1 is reused for the winning index of lanes 2 and 3 */ | |||
| }else if(a4<a3){ | |||
| i1=i4; | |||
| a3=a4; /* a3 now holds the winner of lanes 2 and 3 */ | |||
| }else{ | |||
| i1= i3; | |||
| } | |||
| if(a1==a3){ | |||
| index=i1>index?index:i1; /* tie between the two pair winners: smaller index */ | |||
| *minf=a1; | |||
| }else if(a3<a1){ | |||
| index=i1; | |||
| *minf=a3; | |||
| }else{ | |||
| *minf=a1; | |||
| } | |||
| return index; | |||
| } | |||
| /* | |||
|  * Driver: returns the 1-based position of the first element of x with the | |||
|  * smallest absolute value, or 0 when n or inc_x is not positive. | |||
|  * For unit stride the leading multiple of 64 elements is handed to | |||
|  * siamin_kernel_64 and the remainder is scanned one element at a time; for | |||
|  * other strides the scan is done in groups of 4 followed by a scalar tail. | |||
|  */ | |||
| BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||
|     BLASLONG best = 0;  /* zero-based position of the running minimum */ | |||
|     BLASLONG k = 0;     /* logical element counter */ | |||
|     FLOAT minf; | |||
|     if (n <= 0 || inc_x <= 0) return (best); | |||
|     /* element 0 is the initial minimum; the scans below still start at 0 */ | |||
|     minf = ABS(x[0]); | |||
|     if (inc_x == 1) { | |||
|         BLASLONG bulk = n & -64; | |||
|         if (bulk > 0) { | |||
|             best = siamin_kernel_64(bulk, x, &minf); | |||
|             k = bulk; | |||
|         } | |||
|         /* leftover elements after the vectorised part */ | |||
|         for (; k < n; k++) { | |||
|             if (ABS(x[k]) < minf) { | |||
|                 best = k; | |||
|                 minf = ABS(x[k]); | |||
|             } | |||
|         } | |||
|     } else { | |||
|         BLASLONG blocked = n & -4; | |||
|         BLASLONG off = 0;   /* array offset of logical element k */ | |||
|         while (k < blocked) { | |||
|             BLASLONG u; | |||
|             /* four consecutive strided elements, visited in order */ | |||
|             for (u = 0; u < 4; u++) { | |||
|                 if (ABS(x[off + u * inc_x]) < minf) { | |||
|                     best = k + u; | |||
|                     minf = ABS(x[off + u * inc_x]); | |||
|                 } | |||
|             } | |||
|             off += inc_x * 4; | |||
|             k += 4; | |||
|         } | |||
|         for (; k < n; k++, off += inc_x) { | |||
|             if (ABS(x[off]) < minf) { | |||
|                 best = k; | |||
|                 minf = ABS(x[off]); | |||
|             } | |||
|         } | |||
|     } | |||
|     return (best + 1); | |||
| } | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2019, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #include <math.h> | |||
| #include <altivec.h> | |||
| #if defined(DOUBLE) | |||
| #define ABS fabs | |||
| #else | |||
| #define ABS fabsf | |||
| #endif | |||
| /** | |||
| * Find minimum index | |||
| * Scans the vector 64 floats per loop iteration (two groups of eight 4-float vectors) and | |||
| * keeps, for each of the four vector lanes, the smallest absolute value seen and its index. | |||
| * Every comparison is a strict "greater than", so an earlier element is only replaced by a | |||
| * strictly smaller one; the final lane reduction prefers the lower index on equal values. | |||
| * Indices are tracked in 32-bit unsigned lanes. | |||
| * Warning: requirements n>0 and n % 64 == 0 | |||
| * @param n number of elements to scan | |||
| * @param x pointer to the vector (accessed through a __vector float pointer) | |||
| * @param minf (out) minimum absolute value .( only for output ) | |||
| * @return zero-based index of the minimum absolute value | |||
| */ | |||
| static BLASLONG siamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *minf) { | |||
| BLASLONG index; | |||
| BLASLONG i=0; | |||
| register __vector unsigned int static_index0 = {0,1,2,3}; /* block-local indices of the first vector of a group */ | |||
| register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register | |||
| register __vector unsigned int temp1= temp0<<1; //{8,8,8,8} | |||
| register __vector unsigned int static_index1=static_index0 +temp0;//{4,5,6,7}; | |||
| register __vector unsigned int static_index2=static_index0 +temp1;//{8,9,10,11}; | |||
| register __vector unsigned int static_index3=static_index1 +temp1; //{12,13,14,15}; | |||
| temp0=vec_xor(temp0,temp0); /* cleared to zero: from here on temp0 is the running base offset added to block-local indices */ | |||
| temp1=temp1 <<1 ; //{16,16,16,16} | |||
| register __vector unsigned int quadruple_indices=static_index0;//{0,1,2,3}; | |||
| register __vector float * v_ptrx=(__vector float *)x; /* walks x four floats at a time */ | |||
| register __vector float quadruple_values=vec_abs(v_ptrx[0]); /* running per-lane minimum, seeded with |x[0..3]| */ | |||
| for(; i<n; i+=64){ /* each pass handles 64 floats as two halves of 32 */ | |||
| //absolute temporary vectors | |||
| register __vector float v0=vec_abs(v_ptrx[0]); | |||
| register __vector float v1=vec_abs(v_ptrx[1]); | |||
| register __vector float v2=vec_abs(v_ptrx[2]); | |||
| register __vector float v3=vec_abs(v_ptrx[3]); | |||
| register __vector float v4=vec_abs(v_ptrx[4]); | |||
| register __vector float v5=vec_abs(v_ptrx[5]); | |||
| register __vector float v6=vec_abs(v_ptrx[6]); | |||
| register __vector float v7=vec_abs(v_ptrx[7]); | |||
| //cmp quadruple pairs | |||
| register __vector bool int r1=vec_cmpgt(v0,v1); /* lane mask set where the second vector of the pair is strictly smaller */ | |||
| register __vector bool int r2=vec_cmpgt(v2,v3); | |||
| register __vector bool int r3=vec_cmpgt(v4,v5); | |||
| register __vector bool int r4=vec_cmpgt(v6,v7); | |||
| //select | |||
| register __vector unsigned int ind0_first= vec_sel(static_index0,static_index1,r1); /* per lane: index of the smaller of v0,v1 (v1 only when v0>v1) */ | |||
| register __vector float vf0= vec_sel(v0,v1,r1); /* and the matching value */ | |||
| register __vector unsigned int ind1= vec_sel(static_index2,static_index3,r2); | |||
| register __vector float vf1= vec_sel(v2,v3,r2); | |||
| register __vector unsigned int ind2= vec_sel(static_index0,static_index1,r3); /* v4..v7 reuse the 0..15 index vectors; 16 is added further down */ | |||
| v0=vec_sel(v4,v5,r3); | |||
| register __vector unsigned int ind3= vec_sel(static_index2,static_index3,r4); | |||
| v1=vec_sel(v6,v7,r4); | |||
| // cmp selected | |||
| r1=vec_cmpgt(vf0,vf1); | |||
| r2=vec_cmpgt(v0,v1); | |||
| v_ptrx+=8; /* advance past the eight vectors just loaded */ | |||
| //select from above | |||
| ind0_first= vec_sel(ind0_first,ind1,r1); | |||
| vf0= vec_sel(vf0,vf1,r1) ; | |||
| ind2= vec_sel(ind2,ind3,r2); | |||
| vf1= vec_sel(v0,v1,r2); | |||
| //second indices actually should be within [16,31] so ind2+16 | |||
| ind2 +=temp1; | |||
| //final cmp and select index and value for the first 32 values | |||
| r1=vec_cmpgt(vf0,vf1); | |||
| ind0_first = vec_sel(ind0_first,ind2,r1); | |||
| vf0= vec_sel(vf0,vf1,r1); | |||
| ind0_first+=temp0; //get absolute index | |||
| temp0+=temp1; | |||
| temp0+=temp1; //temp0+32 | |||
| //second part of 32 | |||
| // absolute temporary vectors | |||
| v0=vec_abs(v_ptrx[0]); | |||
| v1=vec_abs(v_ptrx[1]); | |||
| v2=vec_abs(v_ptrx[2]); | |||
| v3=vec_abs(v_ptrx[3]); | |||
| v4=vec_abs(v_ptrx[4]); | |||
| v5=vec_abs(v_ptrx[5]); | |||
| v6=vec_abs(v_ptrx[6]); | |||
| v7=vec_abs(v_ptrx[7]); | |||
| //cmp quadruple pairs | |||
| r1=vec_cmpgt(v0,v1); | |||
| r2=vec_cmpgt(v2,v3); | |||
| r3=vec_cmpgt(v4,v5); | |||
| r4=vec_cmpgt(v6,v7); | |||
| //select | |||
| register __vector unsigned int ind0_second= vec_sel(static_index0,static_index1,r1); | |||
| register __vector float vv0= vec_sel(v0,v1,r1); | |||
| ind1= vec_sel(static_index2,static_index3,r2); | |||
| register __vector float vv1= vec_sel(v2,v3,r2); | |||
| ind2= vec_sel(static_index0,static_index1,r3); | |||
| v0=vec_sel(v4,v5,r3); | |||
| ind3= vec_sel(static_index2,static_index3,r4); | |||
| v1=vec_sel(v6,v7,r4); | |||
| // cmp selected | |||
| r1=vec_cmpgt(vv0,vv1); | |||
| r2=vec_cmpgt(v0,v1); | |||
| v_ptrx+=8; | |||
| //select from above | |||
| ind0_second= vec_sel(ind0_second,ind1,r1); | |||
| vv0= vec_sel(vv0,vv1,r1) ; | |||
| ind2= vec_sel(ind2,ind3,r2); | |||
| vv1= vec_sel(v0,v1,r2) ; | |||
| //second indices actually should be within [16,31] so ind2+16 | |||
| ind2 +=temp1; | |||
| //final cmp and select index and value for the second 32 values | |||
| r1=vec_cmpgt(vv0,vv1); | |||
| ind0_second = vec_sel(ind0_second,ind2,r1); | |||
| vv0= vec_sel(vv0,vv1,r1); | |||
| ind0_second+=temp0; //get absolute index | |||
| //find final quadruple from 64 elements | |||
| r2=vec_cmpgt(vf0,vv0); | |||
| ind2 = vec_sel( ind0_first,ind0_second,r2); | |||
| vv0= vec_sel(vf0,vv0,r2); | |||
| //compare with old quadruple and update | |||
| r3=vec_cmpgt( quadruple_values,vv0); /* replace the running minimum only where it is strictly greater */ | |||
| quadruple_indices = vec_sel( quadruple_indices,ind2,r3); | |||
| quadruple_values= vec_sel(quadruple_values,vv0,r3); | |||
| temp0+=temp1; | |||
| temp0+=temp1; //temp0+32 | |||
| } | |||
| //now we have to choose from 4 values and 4 different indices | |||
| // we will compare pairwise if pairs are exactly the same we will choose minimum between index | |||
| // otherwise we will assign index of the minimum value | |||
| float a1,a2,a3,a4; | |||
| unsigned int i1,i2,i3,i4; | |||
| a1=vec_extract(quadruple_values,0); | |||
| a2=vec_extract(quadruple_values,1); | |||
| a3=vec_extract(quadruple_values,2); | |||
| a4=vec_extract(quadruple_values,3); | |||
| i1=vec_extract(quadruple_indices,0); | |||
| i2=vec_extract(quadruple_indices,1); | |||
| i3=vec_extract(quadruple_indices,2); | |||
| i4=vec_extract(quadruple_indices,3); | |||
| if(a1==a2){ /* lanes 0 and 1: afterwards a1/index hold the winner of this pair */ | |||
| index=i1>i2?i2:i1; | |||
| }else if(a2<a1){ | |||
| index=i2; | |||
| a1=a2; | |||
| }else{ | |||
| index= i1; | |||
| } | |||
| if(a4==a3){ /* lanes 2 and 3: afterwards a3/i1 hold the winner of this pair (i1 is reused as scratch) */ | |||
| i1=i3>i4?i4:i3; | |||
| }else if(a4<a3){ | |||
| i1=i4; | |||
| a3=a4; | |||
| }else{ | |||
| i1= i3; | |||
| } | |||
| if(a1==a3){ /* final round between the two pair winners; also publishes the minimum through minf */ | |||
| index=i1>index?index:i1; | |||
| *minf=a1; | |||
| }else if(a3<a1){ | |||
| index=i1; | |||
| *minf=a3; | |||
| }else{ | |||
| *minf=a1; | |||
| } | |||
| return index; | |||
| } | |||
| /* | |||
|  * Position (1-based) of the first element of x whose absolute value is smallest. | |||
|  * Returns 0 when n or inc_x is not positive. | |||
|  * With unit stride the leading multiple-of-64 part is delegated to siamin_kernel_64; | |||
|  * the remaining elements, and every strided vector, are scanned with scalar code. | |||
|  */ | |||
| BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||
|     BLASLONG best = 0;     /* zero-based position of the current minimum */ | |||
|     BLASLONG pos = 0;      /* logical element counter */ | |||
|     BLASLONG offset = 0;   /* offset of element `pos` inside x */ | |||
|     BLASLONG bulk; | |||
|     BLASLONG lane; | |||
|     FLOAT smallest = 0.0; | |||
|     FLOAT cand; | |||
|     if (!(n > 0 && inc_x > 0)) return (best); | |||
|     smallest = ABS(x[0]); /* the position stays at 0 until something strictly smaller shows up */ | |||
|     if (inc_x != 1) { | |||
|         /* strided input: groups of four elements first, then the leftovers */ | |||
|         bulk = n & -4; | |||
|         for (; pos < bulk; pos += 4, offset += inc_x * 4) { | |||
|             for (lane = 0; lane < 4; lane++) { | |||
|                 cand = ABS(x[offset + lane * inc_x]); | |||
|                 if (cand < smallest) { | |||
|                     best = pos + lane; | |||
|                     smallest = cand; | |||
|                 } | |||
|             } | |||
|         } | |||
|         for (; pos < n; pos++, offset += inc_x) { | |||
|             cand = ABS(x[offset]); | |||
|             if (cand < smallest) { | |||
|                 best = pos; | |||
|                 smallest = cand; | |||
|             } | |||
|         } | |||
|         return (best + 1); | |||
|     } | |||
|     /* contiguous input: vector kernel on the largest multiple of 64, scalar loop on the rest */ | |||
|     bulk = n & -64; | |||
|     if (bulk > 0) { | |||
|         best = siamin_kernel_64(bulk, x, &smallest); | |||
|         pos = bulk; | |||
|     } | |||
|     for (; pos < n; pos++) { | |||
|         cand = ABS(x[pos]); | |||
|         if (cand < smallest) { | |||
|             best = pos; | |||
|             smallest = cand; | |||
|         } | |||
|     } | |||
|     return (best + 1); | |||
| } | |||
| @@ -1,272 +1,272 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2019, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| #include "def_vsx.h" | |||
| #define LOAD ld | |||
| #define STACKSIZE (512 ) /* bytes subtracted from SP in the prologue; the register save area uses offsets 0..479 */ | |||
| #define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */ | |||
| #define M r3 /* M, N, K: NOTE(review): presumably the GEMM dimensions; they are only consumed by the included logic file */ | |||
| #define N r4 | |||
| #define K r5 | |||
| #define A r7 /* A, B, C: NOTE(review): presumably the matrix pointers; confirm against sgemm_logic_power9.S */ | |||
| #define B r8 | |||
| #define C r9 | |||
| #define LDC r10 /* shifted left by 2 in the prologue before the logic file runs */ | |||
| #define OFFSET r6 /* loaded from the stack in the prologue when TRMMKERNEL is defined */ | |||
| #define alpha_r vs20 /* alpha converted to single precision and replicated into all four words */ | |||
| #define save_permute_1 vs21 /* 128-bit constant built from save_permute_12 (high half) and save_permute_11 (low half) */ | |||
| #define save_permute_2 vs22 /* 128-bit constant built from save_permute_22 (high half) and save_permute_21 (low half) */ | |||
| #define permute_mask vs23 /* 128-bit constant built from perm_const2 (high half) and perm_const1 (low half) */ | |||
| #define o0 0 | |||
| #define T1 r11 /* T1..T6 are used in the prologue as scratch registers while the 64-bit constants are assembled */ | |||
| #define T2 r12 | |||
| #define T3 r14 | |||
| #define T4 r15 | |||
| #define T5 r16 | |||
| #define T6 r17 | |||
| #define L r18 /* the remaining aliases are not referenced in this file; presumably used by the included macro/logic files */ | |||
| #define T7 r19 | |||
| #define T8 r20 | |||
| #define TEMP_REG r21 | |||
| #define I r22 | |||
| #define J r23 | |||
| #define AO r24 | |||
| #define BO r25 | |||
| #define CO r26 | |||
| #define T9 r27 | |||
| #define T10 r28 | |||
| #define T11 r29 | |||
| #define T12 r30 | |||
| #define T13 r31 | |||
| #include "sgemm_macros_power9.S" | |||
| .equ perm_const1, 0x0405060700010203 /* low doubleword of permute_mask */ | |||
| .equ perm_const2, 0x0c0d0e0f08090a0b /* high doubleword of permute_mask */ | |||
| .equ save_permute_11, 0x1415161718191a1b /* low doubleword of save_permute_1 */ | |||
| .equ save_permute_12, 0x0405060708090a0b /* high doubleword of save_permute_1 */ | |||
| .equ save_permute_21, 0x101112131c1d1e1f /* low doubleword of save_permute_2 */ | |||
| .equ save_permute_22, 0x000102030c0d0e0f /* high doubleword of save_permute_2 */ | |||
| #ifndef NEEDPARAM | |||
| PROLOGUE | |||
| PROFCODE | |||
| addi SP, SP, -STACKSIZE /* open a STACKSIZE-byte save area below the incoming stack pointer */ | |||
| mflr r0 /* copy the link register (return address) into r0; it is stored into the frame further down */ | |||
| stfd f14, 0(SP) /* offsets 0..136: save floating-point registers f14-f31, 8 bytes each */ | |||
| stfd f15, 8(SP) | |||
| stfd f16, 16(SP) | |||
| stfd f17, 24(SP) | |||
| stfd f18, 32(SP) | |||
| stfd f19, 40(SP) | |||
| stfd f20, 48(SP) | |||
| stfd f21, 56(SP) | |||
| stfd f22, 64(SP) | |||
| stfd f23, 72(SP) | |||
| stfd f24, 80(SP) | |||
| stfd f25, 88(SP) | |||
| stfd f26, 96(SP) | |||
| stfd f27, 104(SP) | |||
| stfd f28, 112(SP) | |||
| stfd f29, 120(SP) | |||
| stfd f30, 128(SP) | |||
| stfd f31, 136(SP) | |||
| std r31, 144(SP) /* offsets 144..280: save general-purpose registers r31 down to r14, 8 bytes each */ | |||
| std r30, 152(SP) | |||
| std r29, 160(SP) | |||
| std r28, 168(SP) | |||
| std r27, 176(SP) | |||
| std r26, 184(SP) | |||
| std r25, 192(SP) | |||
| std r24, 200(SP) | |||
| std r23, 208(SP) | |||
| std r22, 216(SP) | |||
| std r21, 224(SP) | |||
| std r20, 232(SP) | |||
| std r19, 240(SP) | |||
| std r18, 248(SP) | |||
| std r17, 256(SP) | |||
| std r16, 264(SP) | |||
| std r15, 272(SP) | |||
| std r14, 280(SP) | |||
| stxv vs52, 288(SP) /* offsets 288..464: save vector-scalar registers vs52-vs63, 16 bytes each */ | |||
| stxv vs53, 304(SP) | |||
| stxv vs54, 320(SP) | |||
| stxv vs55, 336(SP) | |||
| stxv vs56, 352(SP) | |||
| stxv vs57, 368(SP) | |||
| stxv vs58, 384(SP) | |||
| stxv vs59, 400(SP) | |||
| stxv vs60, 416(SP) | |||
| stxv vs61, 432(SP) | |||
| stxv vs62, 448(SP) | |||
| stxv vs63, 464(SP) | |||
| std r0, FLINK_SAVE(SP) /* store the return address at STACKSIZE+16 from the lowered SP, i.e. 16 bytes above the SP value on entry */ | |||
| #if defined(TRMMKERNEL) | |||
| ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) /* TRMM builds only: read OFFSET at FRAMESLOT(0) relative to the SP value on entry; FRAMESLOT is defined outside this file */ | |||
| #endif | |||
| slwi LDC, LDC, 2 /* LDC <<= 2 (times 4); NOTE(review): presumably turns an element count into a byte stride for 4-byte floats */ | |||
| /*alpha is stored in f1. convert to single and splat*/ | |||
| xscvdpspn alpha_r,vs1 | |||
| xxspltw alpha_r,alpha_r,0 | |||
| /*load reverse permute mask for big endian | |||
| uint128 = 0xc0d0e0f08090a0b0405060700010203 | |||
| */ | |||
| lis T2, perm_const2@highest /* step 1 of 5: each 64-bit constant is assembled 16 bits at a time, starting with bits 48-63 */ | |||
| lis T1, perm_const1@highest | |||
| lis T3, save_permute_12@highest | |||
| lis T4, save_permute_11@highest | |||
| lis T5, save_permute_22@highest | |||
| lis T6, save_permute_21@highest | |||
| ori T2, T2, perm_const2@higher /* step 2: OR in bits 32-47 */ | |||
| ori T1, T1, perm_const1@higher | |||
| ori T3, T3, save_permute_12@higher | |||
| ori T4, T4, save_permute_11@higher | |||
| ori T5, T5, save_permute_22@higher | |||
| ori T6, T6, save_permute_21@higher | |||
| rldicr T2, T2, 32, 31 /* step 3: rotate left by 32 and clear the low 32 bits, moving the assembled half into the upper word */ | |||
| rldicr T1, T1, 32, 31 | |||
| rldicr T3, T3, 32, 31 | |||
| rldicr T4, T4, 32, 31 | |||
| rldicr T5, T5, 32, 31 | |||
| rldicr T6, T6, 32, 31 | |||
| oris T2, T2, perm_const2@h /* step 4: OR in bits 16-31 */ | |||
| oris T1, T1, perm_const1@h | |||
| oris T3, T3, save_permute_12@h | |||
| oris T4, T4, save_permute_11@h | |||
| oris T5, T5, save_permute_22@h | |||
| oris T6, T6, save_permute_21@h | |||
| ori T2, T2, perm_const2@l /* step 5: OR in bits 0-15; the register now holds the full constant */ | |||
| ori T1, T1, perm_const1@l | |||
| ori T3, T3, save_permute_12@l | |||
| ori T4, T4, save_permute_11@l | |||
| ori T5, T5, save_permute_22@l | |||
| ori T6, T6, save_permute_21@l | |||
| li r0,0 /* r0 = 0; NOTE(review): not read again in this file before being reloaded, presumably consumed by the included logic */ | |||
| mtvsrdd permute_mask,T2,T1 /* pair two 64-bit registers into one 128-bit register: first source is the high doubleword, second the low */ | |||
| mtvsrdd save_permute_1,T3,T4 | |||
| mtvsrdd save_permute_2,T5,T6 | |||
| #include "sgemm_logic_power9.S" | |||
| .L999: /* exit path: everything below undoes the prologue; NOTE(review): presumably the branch target used by the included logic */ | |||
| lfd f14, 0(SP) /* restore f14-f31 */ | |||
| lfd f15, 8(SP) | |||
| lfd f16, 16(SP) | |||
| lfd f17, 24(SP) | |||
| lfd f18, 32(SP) | |||
| lfd f19, 40(SP) | |||
| lfd f20, 48(SP) | |||
| lfd f21, 56(SP) | |||
| lfd f22, 64(SP) | |||
| lfd f23, 72(SP) | |||
| lfd f24, 80(SP) | |||
| lfd f25, 88(SP) | |||
| lfd f26, 96(SP) | |||
| lfd f27, 104(SP) | |||
| lfd f28, 112(SP) | |||
| lfd f29, 120(SP) | |||
| lfd f30, 128(SP) | |||
| lfd f31, 136(SP) | |||
| ld r31, 144(SP) /* restore r31 down to r14 */ | |||
| ld r30, 152(SP) | |||
| ld r29, 160(SP) | |||
| ld r28, 168(SP) | |||
| ld r27, 176(SP) | |||
| ld r26, 184(SP) | |||
| ld r25, 192(SP) | |||
| ld r24, 200(SP) | |||
| ld r23, 208(SP) | |||
| ld r22, 216(SP) | |||
| ld r21, 224(SP) | |||
| ld r20, 232(SP) | |||
| ld r19, 240(SP) | |||
| ld r18, 248(SP) | |||
| ld r17, 256(SP) | |||
| ld r16, 264(SP) | |||
| ld r15, 272(SP) | |||
| ld r14, 280(SP) | |||
| ld r0, FLINK_SAVE(SP) /* reload the saved return address into r0 */ | |||
| lxv vs52, 288(SP) /* restore vs52-vs63 */ | |||
| lxv vs53, 304(SP) | |||
| lxv vs54, 320(SP) | |||
| lxv vs55, 336(SP) | |||
| lxv vs56, 352(SP) | |||
| lxv vs57, 368(SP) | |||
| lxv vs58, 384(SP) | |||
| lxv vs59, 400(SP) | |||
| mtlr r0 /* put the return address back into the link register (placed in the middle of the vector restores) */ | |||
| lxv vs60, 416(SP) | |||
| lxv vs61, 432(SP) | |||
| lxv vs62, 448(SP) | |||
| lxv vs63, 464(SP) | |||
| addi SP, SP, STACKSIZE /* release the save area */ | |||
| blr /* return to the caller */ | |||
| EPILOGUE | |||
| #endif | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2019, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| #include "def_vsx.h" | |||
| #define LOAD ld | |||
| #define STACKSIZE (512 ) /* bytes subtracted from SP in the prologue; the register save area uses offsets 0..479 */ | |||
| #define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */ | |||
| #define M r3 /* M, N, K: NOTE(review): presumably the GEMM dimensions; they are only consumed by the included logic file */ | |||
| #define N r4 | |||
| #define K r5 | |||
| #define A r7 /* A, B, C: NOTE(review): presumably the matrix pointers; confirm against sgemm_logic_power9.S */ | |||
| #define B r8 | |||
| #define C r9 | |||
| #define LDC r10 /* shifted left by 2 in the prologue before the logic file runs */ | |||
| #define OFFSET r6 /* loaded from the stack in the prologue when TRMMKERNEL is defined */ | |||
| #define alpha_r vs20 /* alpha converted to single precision and replicated into all four words */ | |||
| #define save_permute_1 vs21 /* 128-bit constant built from save_permute_12 (high half) and save_permute_11 (low half) */ | |||
| #define save_permute_2 vs22 /* 128-bit constant built from save_permute_22 (high half) and save_permute_21 (low half) */ | |||
| #define permute_mask vs23 /* 128-bit constant built from perm_const2 (high half) and perm_const1 (low half) */ | |||
| #define o0 0 | |||
| #define T1 r11 /* T1..T6 are used in the prologue as scratch registers while the 64-bit constants are assembled */ | |||
| #define T2 r12 | |||
| #define T3 r14 | |||
| #define T4 r15 | |||
| #define T5 r16 | |||
| #define T6 r17 | |||
| #define L r18 /* the remaining aliases are not referenced in this file; presumably used by the included macro/logic files */ | |||
| #define T7 r19 | |||
| #define T8 r20 | |||
| #define TEMP_REG r21 | |||
| #define I r22 | |||
| #define J r23 | |||
| #define AO r24 | |||
| #define BO r25 | |||
| #define CO r26 | |||
| #define T9 r27 | |||
| #define T10 r28 | |||
| #define T11 r29 | |||
| #define T12 r30 | |||
| #define T13 r31 | |||
| #include "sgemm_macros_power9.S" | |||
| .equ perm_const1, 0x0405060700010203 /* low doubleword of permute_mask */ | |||
| .equ perm_const2, 0x0c0d0e0f08090a0b /* high doubleword of permute_mask */ | |||
| .equ save_permute_11, 0x1415161718191a1b /* low doubleword of save_permute_1 */ | |||
| .equ save_permute_12, 0x0405060708090a0b /* high doubleword of save_permute_1 */ | |||
| .equ save_permute_21, 0x101112131c1d1e1f /* low doubleword of save_permute_2 */ | |||
| .equ save_permute_22, 0x000102030c0d0e0f /* high doubleword of save_permute_2 */ | |||
| #ifndef NEEDPARAM | |||
| PROLOGUE | |||
| PROFCODE | |||
| addi SP, SP, -STACKSIZE /* open a STACKSIZE-byte save area below the incoming stack pointer */ | |||
| mflr r0 /* copy the link register (return address) into r0; it is stored into the frame further down */ | |||
| stfd f14, 0(SP) /* offsets 0..136: save floating-point registers f14-f31, 8 bytes each */ | |||
| stfd f15, 8(SP) | |||
| stfd f16, 16(SP) | |||
| stfd f17, 24(SP) | |||
| stfd f18, 32(SP) | |||
| stfd f19, 40(SP) | |||
| stfd f20, 48(SP) | |||
| stfd f21, 56(SP) | |||
| stfd f22, 64(SP) | |||
| stfd f23, 72(SP) | |||
| stfd f24, 80(SP) | |||
| stfd f25, 88(SP) | |||
| stfd f26, 96(SP) | |||
| stfd f27, 104(SP) | |||
| stfd f28, 112(SP) | |||
| stfd f29, 120(SP) | |||
| stfd f30, 128(SP) | |||
| stfd f31, 136(SP) | |||
| std r31, 144(SP) /* offsets 144..280: save general-purpose registers r31 down to r14, 8 bytes each */ | |||
| std r30, 152(SP) | |||
| std r29, 160(SP) | |||
| std r28, 168(SP) | |||
| std r27, 176(SP) | |||
| std r26, 184(SP) | |||
| std r25, 192(SP) | |||
| std r24, 200(SP) | |||
| std r23, 208(SP) | |||
| std r22, 216(SP) | |||
| std r21, 224(SP) | |||
| std r20, 232(SP) | |||
| std r19, 240(SP) | |||
| std r18, 248(SP) | |||
| std r17, 256(SP) | |||
| std r16, 264(SP) | |||
| std r15, 272(SP) | |||
| std r14, 280(SP) | |||
| stxv vs52, 288(SP) /* offsets 288..464: save vector-scalar registers vs52-vs63, 16 bytes each */ | |||
| stxv vs53, 304(SP) | |||
| stxv vs54, 320(SP) | |||
| stxv vs55, 336(SP) | |||
| stxv vs56, 352(SP) | |||
| stxv vs57, 368(SP) | |||
| stxv vs58, 384(SP) | |||
| stxv vs59, 400(SP) | |||
| stxv vs60, 416(SP) | |||
| stxv vs61, 432(SP) | |||
| stxv vs62, 448(SP) | |||
| stxv vs63, 464(SP) | |||
| std r0, FLINK_SAVE(SP) /* store the return address at STACKSIZE+16 from the lowered SP, i.e. 16 bytes above the SP value on entry */ | |||
| #if defined(TRMMKERNEL) | |||
| ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) /* TRMM builds only: read OFFSET at FRAMESLOT(0) relative to the SP value on entry; FRAMESLOT is defined outside this file */ | |||
| #endif | |||
| slwi LDC, LDC, 2 /* LDC <<= 2 (times 4); NOTE(review): presumably turns an element count into a byte stride for 4-byte floats */ | |||
| /*alpha is stored in f1. convert to single and splat*/ | |||
| xscvdpspn alpha_r,vs1 | |||
| xxspltw alpha_r,alpha_r,0 | |||
| /*load reverse permute mask for big endian | |||
| uint128 = 0xc0d0e0f08090a0b0405060700010203 | |||
| */ | |||
| lis T2, perm_const2@highest /* step 1 of 5: each 64-bit constant is assembled 16 bits at a time, starting with bits 48-63 */ | |||
| lis T1, perm_const1@highest | |||
| lis T3, save_permute_12@highest | |||
| lis T4, save_permute_11@highest | |||
| lis T5, save_permute_22@highest | |||
| lis T6, save_permute_21@highest | |||
| ori T2, T2, perm_const2@higher /* step 2: OR in bits 32-47 */ | |||
| ori T1, T1, perm_const1@higher | |||
| ori T3, T3, save_permute_12@higher | |||
| ori T4, T4, save_permute_11@higher | |||
| ori T5, T5, save_permute_22@higher | |||
| ori T6, T6, save_permute_21@higher | |||
| rldicr T2, T2, 32, 31 /* step 3: rotate left by 32 and clear the low 32 bits, moving the assembled half into the upper word */ | |||
| rldicr T1, T1, 32, 31 | |||
| rldicr T3, T3, 32, 31 | |||
| rldicr T4, T4, 32, 31 | |||
| rldicr T5, T5, 32, 31 | |||
| rldicr T6, T6, 32, 31 | |||
| oris T2, T2, perm_const2@h /* step 4: OR in bits 16-31 */ | |||
| oris T1, T1, perm_const1@h | |||
| oris T3, T3, save_permute_12@h | |||
| oris T4, T4, save_permute_11@h | |||
| oris T5, T5, save_permute_22@h | |||
| oris T6, T6, save_permute_21@h | |||
| ori T2, T2, perm_const2@l /* step 5: OR in bits 0-15; the register now holds the full constant */ | |||
| ori T1, T1, perm_const1@l | |||
| ori T3, T3, save_permute_12@l | |||
| ori T4, T4, save_permute_11@l | |||
| ori T5, T5, save_permute_22@l | |||
| ori T6, T6, save_permute_21@l | |||
| li r0,0 /* r0 = 0; NOTE(review): not read again in this file before being reloaded, presumably consumed by the included logic */ | |||
| mtvsrdd permute_mask,T2,T1 /* pair two 64-bit registers into one 128-bit register: first source is the high doubleword, second the low */ | |||
| mtvsrdd save_permute_1,T3,T4 | |||
| mtvsrdd save_permute_2,T5,T6 | |||
| #include "sgemm_logic_power9.S" | |||
| .L999: /* exit path: everything below undoes the prologue; NOTE(review): presumably the branch target used by the included logic */ | |||
| lfd f14, 0(SP) /* restore f14-f31 */ | |||
| lfd f15, 8(SP) | |||
| lfd f16, 16(SP) | |||
| lfd f17, 24(SP) | |||
| lfd f18, 32(SP) | |||
| lfd f19, 40(SP) | |||
| lfd f20, 48(SP) | |||
| lfd f21, 56(SP) | |||
| lfd f22, 64(SP) | |||
| lfd f23, 72(SP) | |||
| lfd f24, 80(SP) | |||
| lfd f25, 88(SP) | |||
| lfd f26, 96(SP) | |||
| lfd f27, 104(SP) | |||
| lfd f28, 112(SP) | |||
| lfd f29, 120(SP) | |||
| lfd f30, 128(SP) | |||
| lfd f31, 136(SP) | |||
| ld r31, 144(SP) /* restore r31 down to r14 */ | |||
| ld r30, 152(SP) | |||
| ld r29, 160(SP) | |||
| ld r28, 168(SP) | |||
| ld r27, 176(SP) | |||
| ld r26, 184(SP) | |||
| ld r25, 192(SP) | |||
| ld r24, 200(SP) | |||
| ld r23, 208(SP) | |||
| ld r22, 216(SP) | |||
| ld r21, 224(SP) | |||
| ld r20, 232(SP) | |||
| ld r19, 240(SP) | |||
| ld r18, 248(SP) | |||
| ld r17, 256(SP) | |||
| ld r16, 264(SP) | |||
| ld r15, 272(SP) | |||
| ld r14, 280(SP) | |||
| ld r0, FLINK_SAVE(SP) /* reload the saved return address into r0 */ | |||
| lxv vs52, 288(SP) /* restore vs52-vs63 */ | |||
| lxv vs53, 304(SP) | |||
| lxv vs54, 320(SP) | |||
| lxv vs55, 336(SP) | |||
| lxv vs56, 352(SP) | |||
| lxv vs57, 368(SP) | |||
| lxv vs58, 384(SP) | |||
| lxv vs59, 400(SP) | |||
| mtlr r0 /* put the return address back into the link register (placed in the middle of the vector restores) */ | |||
| lxv vs60, 416(SP) | |||
| lxv vs61, 432(SP) | |||
| lxv vs62, 448(SP) | |||
| lxv vs63, 464(SP) | |||
| addi SP, SP, STACKSIZE /* release the save area */ | |||
| blr /* return to the caller */ | |||
| EPILOGUE | |||
| #endif | |||
| @@ -1,470 +1,470 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2019, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #if !defined(__VEC__) || !defined(__ALTIVEC__) | |||
| #include "../arm/gemv_n.c" | |||
| #else | |||
| #include "common.h" | |||
| #define NBMAX 4096 | |||
| /* | |||
|  * Eight-column update: y[0..n) += sum over c of (alpha * xo[c]) * column_c[0..n). | |||
|  * Columns 0-3 begin at ap[0..3]; columns 4-7 begin lda4 floats after each of those. | |||
|  * Work is done four floats per step and only n/4 whole vectors are processed. | |||
|  */ | |||
| static void sgemv_kernel_4x8(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, BLASLONG lda4, FLOAT *alpha) | |||
| { | |||
|     /* column base pointers: the near four, then the far four */ | |||
|     __vector float *near0 = (__vector float *)ap[0]; | |||
|     __vector float *near1 = (__vector float *)ap[1]; | |||
|     __vector float *near2 = (__vector float *)ap[2]; | |||
|     __vector float *near3 = (__vector float *)ap[3]; | |||
|     __vector float *far0 = (__vector float *)(ap[0] + lda4); | |||
|     __vector float *far1 = (__vector float *)(ap[1] + lda4); | |||
|     __vector float *far2 = (__vector float *)(ap[2] + lda4); | |||
|     __vector float *far3 = (__vector float *)(ap[3] + lda4); | |||
|     __vector float *out = (__vector float *)y; | |||
|     /* x entries pre-multiplied by alpha */ | |||
|     const FLOAT s0 = xo[0] * *alpha; | |||
|     const FLOAT s1 = xo[1] * *alpha; | |||
|     const FLOAT s2 = xo[2] * *alpha; | |||
|     const FLOAT s3 = xo[3] * *alpha; | |||
|     const FLOAT s4 = xo[4] * *alpha; | |||
|     const FLOAT s5 = xo[5] * *alpha; | |||
|     const FLOAT s6 = xo[6] * *alpha; | |||
|     const FLOAT s7 = xo[7] * *alpha; | |||
|     /* each scaled entry replicated across a vector */ | |||
|     __vector float w0 = {s0, s0, s0, s0}; | |||
|     __vector float w1 = {s1, s1, s1, s1}; | |||
|     __vector float w2 = {s2, s2, s2, s2}; | |||
|     __vector float w3 = {s3, s3, s3, s3}; | |||
|     __vector float w4 = {s4, s4, s4, s4}; | |||
|     __vector float w5 = {s5, s5, s5, s5}; | |||
|     __vector float w6 = {s6, s6, s6, s6}; | |||
|     __vector float w7 = {s7, s7, s7, s7}; | |||
|     const BLASLONG nvec = n / 4; | |||
|     BLASLONG k; | |||
|     for (k = 0; k < nvec; k++) { | |||
|         register __vector float acc = out[k]; | |||
|         acc += w0 * near0[k] + w1 * near1[k] + w2 * near2[k] + w3 * near3[k]; | |||
|         acc += w4 * far0[k] + w5 * far1[k] + w6 * far2[k] + w7 * far3[k]; | |||
|         out[k] = acc; | |||
|     } | |||
| } | |||
| /* | |||
|  * Four-column update: y[0..n) += sum over c of (alpha * xo[c]) * ap[c][0..n). | |||
|  * Work is done four floats per step and only n/4 whole vectors are processed. | |||
|  */ | |||
| static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) | |||
| { | |||
|     /* x entries pre-multiplied by alpha */ | |||
|     const FLOAT s0 = xo[0] * *alpha; | |||
|     const FLOAT s1 = xo[1] * *alpha; | |||
|     const FLOAT s2 = xo[2] * *alpha; | |||
|     const FLOAT s3 = xo[3] * *alpha; | |||
|     __vector float w0 = {s0, s0, s0, s0}; | |||
|     __vector float w1 = {s1, s1, s1, s1}; | |||
|     __vector float w2 = {s2, s2, s2, s2}; | |||
|     __vector float w3 = {s3, s3, s3, s3}; | |||
|     __vector float *out = (__vector float *)y; | |||
|     __vector float *col0 = (__vector float *)ap[0]; | |||
|     __vector float *col1 = (__vector float *)ap[1]; | |||
|     __vector float *col2 = (__vector float *)ap[2]; | |||
|     __vector float *col3 = (__vector float *)ap[3]; | |||
|     const BLASLONG nvec = n / 4; | |||
|     BLASLONG k; | |||
|     for (k = 0; k < nvec; k++) { | |||
|         register __vector float acc = out[k]; | |||
|         acc += w0 * col0[k] + w1 * col1[k] + w2 * col2[k] + w3 * col3[k]; | |||
|         out[k] = acc; | |||
|     } | |||
| } | |||
| /* | |||
|  * Two-column update: y[0..n) += (alpha * x[0]) * ap[0][0..n) + (alpha * x[1]) * ap[1][0..n). | |||
|  * Work is done four floats per step and only n/4 whole vectors are processed. | |||
|  */ | |||
| static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| { | |||
|     const FLOAT s0 = x[0] * *alpha; | |||
|     const FLOAT s1 = x[1] * *alpha; | |||
|     __vector float w0 = {s0, s0, s0, s0}; | |||
|     __vector float w1 = {s1, s1, s1, s1}; | |||
|     __vector float *out = (__vector float *)y; | |||
|     __vector float *col0 = (__vector float *)ap[0]; | |||
|     __vector float *col1 = (__vector float *)ap[1]; | |||
|     const BLASLONG nvec = n / 4; | |||
|     BLASLONG k; | |||
|     for (k = 0; k < nvec; k++) { | |||
|         out[k] += w0 * col0[k] + w1 * col1[k]; | |||
|     } | |||
| } | |||
| /* | |||
|  * Single-column update: y[0..n) += (alpha * x[0]) * ap[0..n). | |||
|  * Work is done four floats per step and only n/4 whole vectors are processed. | |||
|  */ | |||
| static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| { | |||
|     const FLOAT scaled = x[0] * *alpha; | |||
|     __vector float weight = {scaled, scaled, scaled, scaled}; | |||
|     __vector float *out = (__vector float *)y; | |||
|     __vector float *col = (__vector float *)ap; | |||
|     const BLASLONG nvec = n / 4; | |||
|     BLASLONG k; | |||
|     for (k = 0; k < nvec; k++) { | |||
|         out[k] += weight * col[k]; | |||
|     } | |||
| } | |||
| /* | |||
|  * Adds the n contiguous values in src onto dest, where consecutive | |||
|  * destination elements are inc_dest floats apart. | |||
|  */ | |||
| static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) | |||
| { | |||
|     BLASLONG k; | |||
|     for (k = 0; k < n; k++) { | |||
|         dest[k * inc_dest] += src[k]; | |||
|     } | |||
| } | |||
| /* Single-precision GEMV without transposition: y += alpha * A * x. | |||
| * A is read column by column: the element in row r, column c is a[c*lda + r]; | |||
| * x is stepped by inc_x and y by inc_y. | |||
| * m - number of rows of A, i.e. elements of y that are updated | |||
| * n - number of columns of A, i.e. elements of x that are read | |||
| * dummy1 - not referenced in this function | |||
| * alpha - scalar applied to every product | |||
| * buffer - scratch floats used as the accumulation target when inc_y != 1 (at most NBMAX are used per row block) | |||
| * Always returns 0; nothing is done when m < 1 or n < 1. | |||
| * Rows are processed in blocks of at most NBMAX rows (always a multiple of 4) by the vector | |||
| * kernels; the last m & 3 rows are finished with scalar dot products. | |||
| */ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | |||
| { | |||
| BLASLONG i; | |||
| FLOAT *a_ptr; | |||
| FLOAT *x_ptr; | |||
| FLOAT *y_ptr; | |||
| FLOAT *ap[4]; | |||
| BLASLONG n1; | |||
| BLASLONG m1; | |||
| BLASLONG m2; | |||
| BLASLONG m3; | |||
| BLASLONG n2; | |||
| BLASLONG lda4 = lda << 2; /* distance in floats between columns c and c+4 */ | |||
| BLASLONG lda8 = lda << 3; /* distance in floats between columns c and c+8 */ | |||
| FLOAT xbuffer[8] __attribute__((aligned(16))); /* staging area for x values gathered from a strided x; only entries 0..3 are written here */ | |||
| FLOAT *ybuffer; | |||
| if ( m < 1 ) return(0); | |||
| if ( n < 1 ) return(0); | |||
| ybuffer = buffer; | |||
| if ( inc_x == 1 ) | |||
| { | |||
| n1 = n >> 3 ; /* contiguous x: columns are consumed eight at a time */ | |||
| n2 = n & 7 ; /* columns left over after the groups of eight */ | |||
| } | |||
| else | |||
| { | |||
| n1 = n >> 2 ; /* strided x: columns are consumed four at a time */ | |||
| n2 = n & 3 ; /* columns left over after the groups of four */ | |||
| } | |||
| m3 = m & 3 ; /* rows that do not fill a 4-float vector; handled by the scalar code at the end */ | |||
| m1 = m & -4 ; /* rows covered by the vector kernels */ | |||
| m2 = (m & (NBMAX-1)) - m3 ; /* size of the final, partial row block (a multiple of 4, possibly 0) */ | |||
| y_ptr = y; | |||
| BLASLONG NB = NBMAX; | |||
| while ( NB == NBMAX ) /* one pass per row block; the loop ends after the first block shorter than NBMAX */ | |||
| { | |||
| m1 -= NB; | |||
| if ( m1 < 0) /* fewer than NBMAX vectorizable rows remain */ | |||
| { | |||
| if ( m2 == 0 ) break; | |||
| NB = m2; | |||
| } | |||
| a_ptr = a; | |||
| x_ptr = x; | |||
| ap[0] = a_ptr; /* ap[0..3] point at four consecutive columns of the current row block */ | |||
| ap[1] = a_ptr + lda; | |||
| ap[2] = ap[1] + lda; | |||
| ap[3] = ap[2] + lda; | |||
| if ( inc_y != 1 ) | |||
| memset(ybuffer,0,NB*4); /* strided y: accumulate into a zeroed scratch buffer; NB*4 bytes assumes 4-byte floats */ | |||
| else | |||
| ybuffer = y_ptr; /* contiguous y: the kernels accumulate straight into y */ | |||
| if ( inc_x == 1 ) | |||
| { | |||
| for( i = 0; i < n1 ; i++) | |||
| { | |||
| sgemv_kernel_4x8(NB,ap,x_ptr,ybuffer,lda4,&alpha); | |||
| ap[0] += lda8; | |||
| ap[1] += lda8; | |||
| ap[2] += lda8; | |||
| ap[3] += lda8; | |||
| a_ptr += lda8; | |||
| x_ptr += 8; | |||
| } | |||
| if ( n2 & 4 ) /* the leftover column count is decomposed into 4 + 2 + 1 */ | |||
| { | |||
| sgemv_kernel_4x4(NB,ap,x_ptr,ybuffer,&alpha); | |||
| ap[0] += lda4; | |||
| ap[1] += lda4; | |||
| ap[2] += lda4; | |||
| ap[3] += lda4; | |||
| a_ptr += lda4; | |||
| x_ptr += 4; | |||
| } | |||
| if ( n2 & 2 ) | |||
| { | |||
| sgemv_kernel_4x2(NB,ap,x_ptr,ybuffer,&alpha); | |||
| a_ptr += lda*2; /* ap[] is not advanced here; the single-column step below uses a_ptr */ | |||
| x_ptr += 2; | |||
| } | |||
| if ( n2 & 1 ) | |||
| { | |||
| sgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,&alpha); | |||
| a_ptr += lda; | |||
| x_ptr += 1; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| for( i = 0; i < n1 ; i++) | |||
| { | |||
| xbuffer[0] = x_ptr[0]; /* gather four strided x values into contiguous storage for the kernel */ | |||
| x_ptr += inc_x; | |||
| xbuffer[1] = x_ptr[0]; | |||
| x_ptr += inc_x; | |||
| xbuffer[2] = x_ptr[0]; | |||
| x_ptr += inc_x; | |||
| xbuffer[3] = x_ptr[0]; | |||
| x_ptr += inc_x; | |||
| sgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,&alpha); | |||
| ap[0] += lda4; | |||
| ap[1] += lda4; | |||
| ap[2] += lda4; | |||
| ap[3] += lda4; | |||
| a_ptr += lda4; | |||
| } | |||
| for( i = 0; i < n2 ; i++) /* remaining columns, one at a time */ | |||
| { | |||
| xbuffer[0] = x_ptr[0]; | |||
| x_ptr += inc_x; | |||
| sgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,&alpha); | |||
| a_ptr += lda; | |||
| } | |||
| } | |||
| a += NB; /* move down to the first row of the next block */ | |||
| if ( inc_y != 1 ) | |||
| { | |||
| add_y(NB,ybuffer,y_ptr,inc_y); /* scatter the accumulated block into the strided y */ | |||
| y_ptr += NB * inc_y; | |||
| } | |||
| else | |||
| y_ptr += NB ; | |||
| } | |||
| if ( m3 == 0 ) return(0); | |||
| if ( m3 == 3 ) /* three trailing rows: three scalar dot products over all n columns */ | |||
| { | |||
| a_ptr = a; | |||
| x_ptr = x; | |||
| FLOAT temp0 = 0.0; | |||
| FLOAT temp1 = 0.0; | |||
| FLOAT temp2 = 0.0; | |||
| if ( lda == 3 && inc_x ==1 ) /* columns are packed back to back, so four columns are 12 consecutive floats */ | |||
| { | |||
| for( i = 0; i < ( n & -4 ); i+=4 ) | |||
| { | |||
| temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1]; | |||
| temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1]; | |||
| temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1]; | |||
| temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9] * x_ptr[3]; | |||
| temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3]; | |||
| temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3]; | |||
| a_ptr += 12; | |||
| x_ptr += 4; | |||
| } | |||
| for( ; i < n; i++ ) | |||
| { | |||
| temp0 += a_ptr[0] * x_ptr[0]; | |||
| temp1 += a_ptr[1] * x_ptr[0]; | |||
| temp2 += a_ptr[2] * x_ptr[0]; | |||
| a_ptr += 3; | |||
| x_ptr ++; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| for( i = 0; i < n; i++ ) | |||
| { | |||
| temp0 += a_ptr[0] * x_ptr[0]; | |||
| temp1 += a_ptr[1] * x_ptr[0]; | |||
| temp2 += a_ptr[2] * x_ptr[0]; | |||
| a_ptr += lda; | |||
| x_ptr += inc_x; | |||
| } | |||
| } | |||
| y_ptr[0] += alpha * temp0; /* alpha is applied once per row, after the accumulation */ | |||
| y_ptr += inc_y; | |||
| y_ptr[0] += alpha * temp1; | |||
| y_ptr += inc_y; | |||
| y_ptr[0] += alpha * temp2; | |||
| return(0); | |||
| } | |||
| if ( m3 == 2 ) /* two trailing rows */ | |||
| { | |||
| a_ptr = a; | |||
| x_ptr = x; | |||
| FLOAT temp0 = 0.0; | |||
| FLOAT temp1 = 0.0; | |||
| if ( lda == 2 && inc_x ==1 ) /* packed columns: four columns are 8 consecutive floats */ | |||
| { | |||
| for( i = 0; i < (n & -4) ; i+=4 ) | |||
| { | |||
| temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1]; | |||
| temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1]; | |||
| temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3]; | |||
| temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3]; | |||
| a_ptr += 8; | |||
| x_ptr += 4; | |||
| } | |||
| for( ; i < n; i++ ) | |||
| { | |||
| temp0 += a_ptr[0] * x_ptr[0]; | |||
| temp1 += a_ptr[1] * x_ptr[0]; | |||
| a_ptr += 2; | |||
| x_ptr ++; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| for( i = 0; i < n; i++ ) | |||
| { | |||
| temp0 += a_ptr[0] * x_ptr[0]; | |||
| temp1 += a_ptr[1] * x_ptr[0]; | |||
| a_ptr += lda; | |||
| x_ptr += inc_x; | |||
| } | |||
| } | |||
| y_ptr[0] += alpha * temp0; | |||
| y_ptr += inc_y; | |||
| y_ptr[0] += alpha * temp1; | |||
| return(0); | |||
| } | |||
| if ( m3 == 1 ) /* one trailing row: a single dot product */ | |||
| { | |||
| a_ptr = a; | |||
| x_ptr = x; | |||
| FLOAT temp = 0.0; | |||
| if ( lda == 1 && inc_x ==1 ) /* both the row of A and x are contiguous, so they are indexed directly */ | |||
| { | |||
| for( i = 0; i < (n & -4); i+=4 ) | |||
| { | |||
| temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3]; | |||
| } | |||
| for( ; i < n; i++ ) | |||
| { | |||
| temp += a_ptr[i] * x_ptr[i]; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| for( i = 0; i < n; i++ ) | |||
| { | |||
| temp += a_ptr[0] * x_ptr[0]; | |||
| a_ptr += lda; | |||
| x_ptr += inc_x; | |||
| } | |||
| } | |||
| y_ptr[0] += alpha * temp; | |||
| return(0); | |||
| } | |||
| return(0); | |||
| } | |||
| #endif | |||
| /*************************************************************************** | |||
| Copyright (c) 2019, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #if !defined(__VEC__) || !defined(__ALTIVEC__) | |||
| #include "../arm/gemv_n.c" | |||
| #else | |||
| #include "common.h" | |||
| #define NBMAX 4096 | |||
| static void sgemv_kernel_4x8(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, BLASLONG lda4, FLOAT *alpha) /* y[i] += alpha * sum over 8 columns of xo[k] * column_k[i]; columns 0..3 are ap[0..3], columns 4..7 are the same pointers advanced by lda4. Only the first 4*(n/4) elements of y are updated. */ | |||
| { | |||
| BLASLONG i; | |||
| FLOAT *a0,*a1,*a2,*a3,*b0,*b1,*b2,*b3; | |||
| FLOAT x0,x1,x2,x3,x4,x5,x6,x7; | |||
| a0 = ap[0]; | |||
| a1 = ap[1]; | |||
| a2 = ap[2]; | |||
| a3 = ap[3]; | |||
| b0 = a0 + lda4 ; /* b0..b3: second group of four columns, lda4 elements past a0..a3 */ | |||
| b1 = a1 + lda4 ; | |||
| b2 = a2 + lda4 ; | |||
| b3 = a3 + lda4 ; | |||
| x0 = xo[0] * *alpha; /* alpha is folded into each x coefficient once, outside the loop */ | |||
| x1 = xo[1] * *alpha; | |||
| x2 = xo[2] * *alpha; | |||
| x3 = xo[3] * *alpha; | |||
| x4 = xo[4] * *alpha; | |||
| x5 = xo[5] * *alpha; | |||
| x6 = xo[6] * *alpha; | |||
| x7 = xo[7] * *alpha; | |||
| __vector float* va0 = (__vector float*)a0; /* NOTE(review): scalar pointers are reinterpreted as 4-float vector pointers; presumably the target tolerates the resulting alignment - confirm */ | |||
| __vector float* va1 = (__vector float*)a1; | |||
| __vector float* va2 = (__vector float*)a2; | |||
| __vector float* va3 = (__vector float*)a3; | |||
| __vector float* vb0 = (__vector float*)b0; | |||
| __vector float* vb1 = (__vector float*)b1; | |||
| __vector float* vb2 = (__vector float*)b2; | |||
| __vector float* vb3 = (__vector float*)b3; | |||
| __vector float v_x0 = {x0,x0,x0,x0}; /* each scaled coefficient is replicated across all four lanes */ | |||
| __vector float v_x1 = {x1,x1,x1,x1}; | |||
| __vector float v_x2 = {x2,x2,x2,x2}; | |||
| __vector float v_x3 = {x3,x3,x3,x3}; | |||
| __vector float v_x4 = {x4,x4,x4,x4}; | |||
| __vector float v_x5 = {x5,x5,x5,x5}; | |||
| __vector float v_x6 = {x6,x6,x6,x6}; | |||
| __vector float v_x7 = {x7,x7,x7,x7}; | |||
| __vector float* v_y =(__vector float*)y; | |||
| for ( i=0; i< n/4; i++) /* one 4-float vector of y per iteration; an n%4 remainder is not processed here */ | |||
| { | |||
| register __vector float vy=v_y[i]; | |||
| vy += v_x0 * va0[i] + v_x1 * va1[i] + v_x2 * va2[i] + v_x3 * va3[i] ; /* contribution of columns 0..3 */ | |||
| vy += v_x4 * vb0[i] + v_x5 * vb1[i] + v_x6 * vb2[i] + v_x7 * vb3[i] ; /* contribution of columns 4..7 */ | |||
| v_y[i] =vy; | |||
| } | |||
| } | |||
| static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) /* y[i] += alpha * sum over 4 columns of xo[k] * ap[k][i], for the first 4*(n/4) elements of y */ | |||
| { | |||
| BLASLONG i; | |||
| FLOAT x0,x1,x2,x3; | |||
| x0 = xo[0] * *alpha; /* alpha is folded into the coefficients before the loop */ | |||
| x1 = xo[1] * *alpha; | |||
| x2 = xo[2] * *alpha; | |||
| x3 = xo[3] * *alpha; | |||
| __vector float v_x0 = {x0,x0,x0,x0}; /* replicate each coefficient across the four lanes */ | |||
| __vector float v_x1 = {x1,x1,x1,x1}; | |||
| __vector float v_x2 = {x2,x2,x2,x2}; | |||
| __vector float v_x3 = {x3,x3,x3,x3}; | |||
| __vector float* v_y =(__vector float*)y; /* NOTE(review): y and the column pointers are accessed as vectors; alignment expectations depend on the target - confirm */ | |||
| __vector float* va0 = (__vector float*)ap[0]; | |||
| __vector float* va1 = (__vector float*)ap[1]; | |||
| __vector float* va2 = (__vector float*)ap[2]; | |||
| __vector float* va3 = (__vector float*)ap[3]; | |||
| for ( i=0; i< n/4; i++ ) /* an n%4 remainder is not processed here */ | |||
| { | |||
| register __vector float vy=v_y[i]; | |||
| vy += v_x0 * va0[i] + v_x1 * va1[i] + v_x2 * va2[i] + v_x3 * va3[i] ; | |||
| v_y[i] =vy; | |||
| } | |||
| } | |||
| static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) /* y[i] += alpha * (x[0] * ap[0][i] + x[1] * ap[1][i]) for the first 4*(n/4) elements of y */ | |||
| { | |||
| BLASLONG i; | |||
| FLOAT x0,x1; | |||
| x0 = x[0] * *alpha; /* alpha is folded into the coefficients before the loop */ | |||
| x1 = x[1] * *alpha; | |||
| __vector float v_x0 = {x0,x0,x0,x0}; | |||
| __vector float v_x1 = {x1,x1,x1,x1}; | |||
| __vector float* v_y =(__vector float*)y; /* NOTE(review): vector access through cast scalar pointers; alignment expectations depend on the target - confirm */ | |||
| __vector float* va0 = (__vector float*)ap[0]; | |||
| __vector float* va1 = (__vector float*)ap[1]; | |||
| for ( i=0; i< n/4; i++ ) /* an n%4 remainder is not processed here */ | |||
| { | |||
| v_y[i] += v_x0 * va0[i] + v_x1 * va1[i] ; | |||
| } | |||
| } | |||
| static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) /* y[i] += alpha * x[0] * ap[i] for the first 4*(n/4) elements of y; ap is a single column here, not an array of column pointers */ | |||
| { | |||
| BLASLONG i; | |||
| FLOAT x0 ; | |||
| x0 = x[0] * *alpha; /* alpha is folded into the coefficient before the loop */ | |||
| __vector float v_x0 = {x0,x0,x0,x0}; | |||
| __vector float* v_y =(__vector float*)y; /* NOTE(review): vector access through cast scalar pointers; alignment expectations depend on the target - confirm */ | |||
| __vector float* va0 = (__vector float*)ap; | |||
| for ( i=0; i< n/4; i++ ) /* an n%4 remainder is not processed here */ | |||
| { | |||
| v_y[i] += v_x0 * va0[i] ; | |||
| } | |||
| } | |||
| static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) | |||
| { | |||
| /* Scatter-accumulate: dest[k * inc_dest] += src[k] for every k in [0, n). */ | |||
| BLASLONG k = 0; | |||
| while ( k < n ) | |||
| { | |||
| dest[k * inc_dest] += src[k]; | |||
| k++; | |||
| } | |||
| } | |||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) /* Accumulates y[i*inc_y] += alpha * sum_j a[i + j*lda] * x[j*inc_x] for i in [0,m), j in [0,n). The first m & -4 rows go through the vector kernels in row blocks of at most NBMAX; the last m & 3 rows are handled by scalar code. dummy1 is not read. Always returns 0. */ | |||
| { | |||
| BLASLONG i; | |||
| FLOAT *a_ptr; | |||
| FLOAT *x_ptr; | |||
| FLOAT *y_ptr; | |||
| FLOAT *ap[4]; | |||
| BLASLONG n1; | |||
| BLASLONG m1; | |||
| BLASLONG m2; | |||
| BLASLONG m3; | |||
| BLASLONG n2; | |||
| BLASLONG lda4 = lda << 2; /* element offset of 4 columns */ | |||
| BLASLONG lda8 = lda << 3; /* element offset of 8 columns */ | |||
| FLOAT xbuffer[8] __attribute__((aligned(16))); /* holds x values gathered from a strided x before they are handed to a kernel */ | |||
| FLOAT *ybuffer; | |||
| if ( m < 1 ) return(0); | |||
| if ( n < 1 ) return(0); | |||
| ybuffer = buffer; /* caller-provided scratch; used as the block accumulator when y is strided */ | |||
| if ( inc_x == 1 ) | |||
| { | |||
| n1 = n >> 3 ; /* contiguous x: columns are consumed 8 at a time */ | |||
| n2 = n & 7 ; | |||
| } | |||
| else | |||
| { | |||
| n1 = n >> 2 ; /* strided x: columns are consumed 4 at a time */ | |||
| n2 = n & 3 ; | |||
| } | |||
| m3 = m & 3 ; /* rows left over after the 4-row vector granularity */ | |||
| m1 = m & -4 ; /* rows handled by the vector kernels */ | |||
| m2 = (m & (NBMAX-1)) - m3 ; /* size of the final partial row block; the mask acts as a modulo because NBMAX is a power of two */ | |||
| y_ptr = y; | |||
| BLASLONG NB = NBMAX; | |||
| while ( NB == NBMAX ) /* one pass per full NBMAX block, then one last pass with NB = m2 if a partial block remains */ | |||
| { | |||
| m1 -= NB; | |||
| if ( m1 < 0) | |||
| { | |||
| if ( m2 == 0 ) break; | |||
| NB = m2; | |||
| } | |||
| a_ptr = a; | |||
| x_ptr = x; | |||
| ap[0] = a_ptr; /* ap[0..3]: four consecutive columns of the current row block */ | |||
| ap[1] = a_ptr + lda; | |||
| ap[2] = ap[1] + lda; | |||
| ap[3] = ap[2] + lda; | |||
| if ( inc_y != 1 ) | |||
| memset(ybuffer,0,NB*4); /* zero the NB block accumulators; NOTE(review): the 4 is a hard-coded element size in bytes, presumably sizeof(FLOAT) for single precision - confirm */ | |||
| else | |||
| ybuffer = y_ptr; /* contiguous y: the kernels accumulate in place */ | |||
| if ( inc_x == 1 ) | |||
| { | |||
| for( i = 0; i < n1 ; i++) | |||
| { | |||
| sgemv_kernel_4x8(NB,ap,x_ptr,ybuffer,lda4,&alpha); | |||
| ap[0] += lda8; | |||
| ap[1] += lda8; | |||
| ap[2] += lda8; | |||
| ap[3] += lda8; | |||
| a_ptr += lda8; | |||
| x_ptr += 8; | |||
| } | |||
| if ( n2 & 4 ) /* remaining columns are handled as 4, then 2, then 1 */ | |||
| { | |||
| sgemv_kernel_4x4(NB,ap,x_ptr,ybuffer,&alpha); | |||
| ap[0] += lda4; | |||
| ap[1] += lda4; | |||
| ap[2] += lda4; | |||
| ap[3] += lda4; | |||
| a_ptr += lda4; | |||
| x_ptr += 4; | |||
| } | |||
| if ( n2 & 2 ) | |||
| { | |||
| sgemv_kernel_4x2(NB,ap,x_ptr,ybuffer,&alpha); | |||
| a_ptr += lda*2; /* only a_ptr is advanced; ap[] is not read again before it is reset at the top of the block loop */ | |||
| x_ptr += 2; | |||
| } | |||
| if ( n2 & 1 ) | |||
| { | |||
| sgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,&alpha); | |||
| a_ptr += lda; | |||
| x_ptr += 1; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| for( i = 0; i < n1 ; i++) | |||
| { | |||
| xbuffer[0] = x_ptr[0]; /* gather four strided x values into contiguous storage for the kernel */ | |||
| x_ptr += inc_x; | |||
| xbuffer[1] = x_ptr[0]; | |||
| x_ptr += inc_x; | |||
| xbuffer[2] = x_ptr[0]; | |||
| x_ptr += inc_x; | |||
| xbuffer[3] = x_ptr[0]; | |||
| x_ptr += inc_x; | |||
| sgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,&alpha); | |||
| ap[0] += lda4; | |||
| ap[1] += lda4; | |||
| ap[2] += lda4; | |||
| ap[3] += lda4; | |||
| a_ptr += lda4; | |||
| } | |||
| for( i = 0; i < n2 ; i++) /* up to three leftover columns, one at a time */ | |||
| { | |||
| xbuffer[0] = x_ptr[0]; | |||
| x_ptr += inc_x; | |||
| sgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,&alpha); | |||
| a_ptr += lda; | |||
| } | |||
| } | |||
| a += NB; /* the next row block starts NB rows further down every column */ | |||
| if ( inc_y != 1 ) | |||
| { | |||
| add_y(NB,ybuffer,y_ptr,inc_y); /* scatter the block result into the strided y */ | |||
| y_ptr += NB * inc_y; | |||
| } | |||
| else | |||
| y_ptr += NB ; | |||
| } | |||
| if ( m3 == 0 ) return(0); /* below: scalar handling of the last m3 rows; a and y_ptr now address the first leftover row */ | |||
| if ( m3 == 3 ) | |||
| { | |||
| a_ptr = a; | |||
| x_ptr = x; | |||
| FLOAT temp0 = 0.0; | |||
| FLOAT temp1 = 0.0; | |||
| FLOAT temp2 = 0.0; | |||
| if ( lda == 3 && inc_x ==1 ) /* columns are packed back to back, so four columns span 12 consecutive elements */ | |||
| { | |||
| for( i = 0; i < ( n & -4 ); i+=4 ) | |||
| { | |||
| temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1]; | |||
| temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1]; | |||
| temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1]; | |||
| temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9] * x_ptr[3]; | |||
| temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3]; | |||
| temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3]; | |||
| a_ptr += 12; | |||
| x_ptr += 4; | |||
| } | |||
| for( ; i < n; i++ ) | |||
| { | |||
| temp0 += a_ptr[0] * x_ptr[0]; | |||
| temp1 += a_ptr[1] * x_ptr[0]; | |||
| temp2 += a_ptr[2] * x_ptr[0]; | |||
| a_ptr += 3; | |||
| x_ptr ++; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| for( i = 0; i < n; i++ ) /* general stride: one column per iteration */ | |||
| { | |||
| temp0 += a_ptr[0] * x_ptr[0]; | |||
| temp1 += a_ptr[1] * x_ptr[0]; | |||
| temp2 += a_ptr[2] * x_ptr[0]; | |||
| a_ptr += lda; | |||
| x_ptr += inc_x; | |||
| } | |||
| } | |||
| y_ptr[0] += alpha * temp0; /* alpha is applied once to each accumulated row sum */ | |||
| y_ptr += inc_y; | |||
| y_ptr[0] += alpha * temp1; | |||
| y_ptr += inc_y; | |||
| y_ptr[0] += alpha * temp2; | |||
| return(0); | |||
| } | |||
| if ( m3 == 2 ) | |||
| { | |||
| a_ptr = a; | |||
| x_ptr = x; | |||
| FLOAT temp0 = 0.0; | |||
| FLOAT temp1 = 0.0; | |||
| if ( lda == 2 && inc_x ==1 ) /* packed columns: four columns span 8 consecutive elements */ | |||
| { | |||
| for( i = 0; i < (n & -4) ; i+=4 ) | |||
| { | |||
| temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1]; | |||
| temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1]; | |||
| temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3]; | |||
| temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3]; | |||
| a_ptr += 8; | |||
| x_ptr += 4; | |||
| } | |||
| for( ; i < n; i++ ) | |||
| { | |||
| temp0 += a_ptr[0] * x_ptr[0]; | |||
| temp1 += a_ptr[1] * x_ptr[0]; | |||
| a_ptr += 2; | |||
| x_ptr ++; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| for( i = 0; i < n; i++ ) | |||
| { | |||
| temp0 += a_ptr[0] * x_ptr[0]; | |||
| temp1 += a_ptr[1] * x_ptr[0]; | |||
| a_ptr += lda; | |||
| x_ptr += inc_x; | |||
| } | |||
| } | |||
| y_ptr[0] += alpha * temp0; | |||
| y_ptr += inc_y; | |||
| y_ptr[0] += alpha * temp1; | |||
| return(0); | |||
| } | |||
| if ( m3 == 1 ) | |||
| { | |||
| a_ptr = a; | |||
| x_ptr = x; | |||
| FLOAT temp = 0.0; | |||
| if ( lda == 1 && inc_x ==1 ) /* the single row is contiguous, so this is a plain dot product unrolled by four */ | |||
| { | |||
| for( i = 0; i < (n & -4); i+=4 ) | |||
| { | |||
| temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3]; | |||
| } | |||
| for( ; i < n; i++ ) | |||
| { | |||
| temp += a_ptr[i] * x_ptr[i]; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| for( i = 0; i < n; i++ ) | |||
| { | |||
| temp += a_ptr[0] * x_ptr[0]; | |||
| a_ptr += lda; | |||
| x_ptr += inc_x; | |||
| } | |||
| } | |||
| y_ptr[0] += alpha * temp; | |||
| return(0); | |||
| } | |||
| return(0); | |||
| } | |||
| #endif | |||
| @@ -1,484 +1,484 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2019, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #if !defined(__VEC__) || !defined(__ALTIVEC__) | |||
| #include "../arm/gemv_t.c" | |||
| #else | |||
| #include "common.h" | |||
| #define NBMAX 2048 | |||
| #include <altivec.h> | |||
| static void sgemv_kernel_4x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { /* For k in 0..7: y[k] += alpha * sum_i x[i] * ap[k*lda + i], taken over the first 4*(n/4) values of i. */ | |||
| BLASLONG i; | |||
| FLOAT *a0, *a1, *a2, *a3, *a4, *a5, *a6, *a7; | |||
| __vector float *va0, *va1, *va2, *va3, *va4, *va5, *va6, *va7, *v_x; | |||
| register __vector float temp0 = {0,0,0,0}; /* temp0..temp7: four lane-wise partial sums per column */ | |||
| register __vector float temp1 = {0,0,0,0}; | |||
| register __vector float temp2 = {0,0,0,0}; | |||
| register __vector float temp3 = {0,0,0,0}; | |||
| register __vector float temp4 = {0,0,0,0}; | |||
| register __vector float temp5 = {0,0,0,0}; | |||
| register __vector float temp6 = {0,0,0,0}; | |||
| register __vector float temp7 = {0,0,0,0}; | |||
| a0 = ap; /* a0..a7: eight consecutive columns, lda elements apart */ | |||
| a1 = ap + lda; | |||
| a2 = a1 + lda; | |||
| a3 = a2 + lda; | |||
| a4 = a3 + lda; | |||
| a5 = a4 + lda; | |||
| a6 = a5 + lda; | |||
| a7 = a6 + lda; | |||
| va0 = (__vector float*) a0; /* NOTE(review): scalar pointers are reinterpreted as 4-float vector pointers; alignment expectations depend on the target - confirm */ | |||
| va1 = (__vector float*) a1; | |||
| va2 = (__vector float*) a2; | |||
| va3 = (__vector float*) a3; | |||
| va4 = (__vector float*) a4; | |||
| va5 = (__vector float*) a5; | |||
| va6 = (__vector float*) a6; | |||
| va7 = (__vector float*) a7; | |||
| v_x = (__vector float*) x; | |||
| for (i = 0; i < n/4; i ++) { /* an n%4 remainder is not processed here */ | |||
| temp0 += v_x[i] * va0[i]; | |||
| temp1 += v_x[i] * va1[i]; | |||
| temp2 += v_x[i] * va2[i]; | |||
| temp3 += v_x[i] * va3[i]; | |||
| temp4 += v_x[i] * va4[i]; | |||
| temp5 += v_x[i] * va5[i]; | |||
| temp6 += v_x[i] * va6[i]; | |||
| temp7 += v_x[i] * va7[i]; | |||
| } | |||
| y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); /* sum the four lanes, scale by alpha, accumulate into y */ | |||
| y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); | |||
| y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]); | |||
| y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]); | |||
| y[4] += alpha * (temp4[0] + temp4[1]+temp4[2] + temp4[3]); | |||
| y[5] += alpha * (temp5[0] + temp5[1]+temp5[2] + temp5[3]); | |||
| y[6] += alpha * (temp6[0] + temp6[1]+temp6[2] + temp6[3]); | |||
| y[7] += alpha * (temp7[0] + temp7[1]+temp7[2] + temp7[3]); | |||
| } | |||
| static void sgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { /* For k in 0..3: y[k] += alpha * sum_i x[i] * ap[k*lda + i], taken over the first 4*(n/4) values of i. */ | |||
| BLASLONG i = 0; | |||
| FLOAT *a0, *a1, *a2, *a3; | |||
| a0 = ap; /* a0..a3: four consecutive columns, lda elements apart */ | |||
| a1 = ap + lda; | |||
| a2 = a1 + lda; | |||
| a3 = a2 + lda; | |||
| __vector float* va0 = (__vector float*) a0; /* NOTE(review): vector access through cast scalar pointers; alignment expectations depend on the target - confirm */ | |||
| __vector float* va1 = (__vector float*) a1; | |||
| __vector float* va2 = (__vector float*) a2; | |||
| __vector float* va3 = (__vector float*) a3; | |||
| __vector float* v_x = (__vector float*) x; | |||
| register __vector float temp0 = {0,0,0,0}; /* four lane-wise partial sums per column */ | |||
| register __vector float temp1 = {0,0,0,0}; | |||
| register __vector float temp2 = {0,0,0,0}; | |||
| register __vector float temp3 = {0,0,0,0}; | |||
| for (i = 0; i < n / 4; i ++) { /* an n%4 remainder is not processed here */ | |||
| temp0 += v_x[i] * va0[i]; | |||
| temp1 += v_x[i] * va1[i]; | |||
| temp2 += v_x[i] * va2[i]; | |||
| temp3 += v_x[i] * va3[i]; | |||
| } | |||
| y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); /* sum the four lanes, scale by alpha, accumulate into y */ | |||
| y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); | |||
| y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]); | |||
| y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]); | |||
| } | |||
| static void sgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha, BLASLONG inc_y) { /* Two-column dot products over the first 4*(n/4) elements: the result for column 0 is added to y[0], the result for column 1 to y[inc_y]. */ | |||
| BLASLONG i; | |||
| FLOAT *a0, *a1; | |||
| a0 = ap; | |||
| a1 = ap + lda; | |||
| __vector float* va0 = (__vector float*) a0; /* NOTE(review): vector access through cast scalar pointers; alignment expectations depend on the target - confirm */ | |||
| __vector float* va1 = (__vector float*) a1; | |||
| __vector float* v_x = (__vector float*) x; | |||
| __vector float temp0 = {0,0,0,0}; | |||
| __vector float temp1 = {0,0,0,0}; | |||
| for (i = 0; i < n / 4; i ++) { /* an n%4 remainder is not processed here */ | |||
| temp0 += v_x[i] * va0[i]; | |||
| temp1 += v_x[i] * va1[i]; | |||
| } | |||
| y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); | |||
| y[inc_y] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); /* unlike the 4x8 and 4x4 kernels, this one applies the y stride itself */ | |||
| } | |||
| static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { /* y[0] += alpha * sum_i x[i] * ap[i], taken over the first 4*(n/4) values of i. */ | |||
| BLASLONG i; | |||
| FLOAT *a0; | |||
| a0 = ap; | |||
| __vector float* va0 = (__vector float*) a0; /* NOTE(review): vector access through cast scalar pointers; alignment expectations depend on the target - confirm */ | |||
| __vector float* v_x = (__vector float*) x; | |||
| __vector float temp0 = {0,0,0,0}; | |||
| for (i = 0; i < n / 4; i ++) { /* an n%4 remainder is not processed here */ | |||
| temp0 += v_x[i] * va0[i] ; | |||
| } | |||
| y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); /* sum the four lanes, scale by alpha, accumulate into y */ | |||
| } | |||
| static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { | |||
| /* Gather n elements of src, spaced inc_src apart, into the contiguous array dest. */ | |||
| BLASLONG k = 0; | |||
| while (k < n) { | |||
| dest[k] = src[k * inc_src]; | |||
| k++; | |||
| } | |||
| } | |||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) { /* Accumulates y[j*inc_y] += alpha * sum_i a[i + j*lda] * x[i*inc_x] for j in [0,n), i in [0,m). The first m - (m & 3) rows go through the vector kernels in row blocks of at most NBMAX; the last m & 3 rows are handled by scalar code. dummy1 is not read. Always returns 0. */ | |||
| BLASLONG i; | |||
| BLASLONG j; | |||
| FLOAT *a_ptr; | |||
| FLOAT *x_ptr; | |||
| FLOAT *y_ptr; | |||
| BLASLONG n1; | |||
| BLASLONG m1; | |||
| BLASLONG m2; | |||
| BLASLONG m3; | |||
| BLASLONG n2; | |||
| FLOAT ybuffer[8] __attribute__((aligned(16))); /* kernel results are collected here before being added to y */ | |||
| FLOAT *xbuffer; | |||
| if (m < 1) return (0); | |||
| if (n < 1) return (0); | |||
| xbuffer = buffer; /* caller-provided scratch; receives a contiguous copy of x when x is strided */ | |||
| n1 = n >> 3; /* number of 8-column groups */ | |||
| n2 = n & 7; /* columns left after the 8-column groups */ | |||
| m3 = m & 3; /* rows left over after the 4-row vector granularity */ | |||
| m1 = m - m3; /* rows handled by the vector kernels */ | |||
| m2 = (m & (NBMAX - 1)) - m3; /* size of the final partial row block; the mask acts as a modulo because NBMAX is a power of two */ | |||
| BLASLONG NB = NBMAX; | |||
| while (NB == NBMAX) { /* one pass per full NBMAX block, then one last pass with NB = m2 if a partial block remains */ | |||
| m1 -= NB; | |||
| if (m1 < 0) { | |||
| if (m2 == 0) break; | |||
| NB = m2; | |||
| } | |||
| y_ptr = y; /* every row block contributes to all n outputs, so restart at the first one */ | |||
| a_ptr = a; | |||
| x_ptr = x; | |||
| if (inc_x != 1) | |||
| copy_x(NB, x_ptr, xbuffer, inc_x); /* gather this block of a strided x into contiguous storage */ | |||
| else | |||
| xbuffer = x_ptr; /* contiguous x is read in place */ | |||
| BLASLONG lda8 = lda << 3; /* element offset of 8 columns */ | |||
| if (inc_y == 1) { /* contiguous y: the kernel accumulates straight into y */ | |||
| for (i = 0; i < n1; i++) { | |||
| sgemv_kernel_4x8(NB, lda, a_ptr, xbuffer, y_ptr, alpha); | |||
| y_ptr += 8; | |||
| a_ptr += lda8; | |||
| } | |||
| } else { /* strided y: the kernel writes into a zeroed ybuffer, which is then scattered */ | |||
| for (i = 0; i < n1; i++) { | |||
| ybuffer[0] = 0; | |||
| ybuffer[1] = 0; | |||
| ybuffer[2] = 0; | |||
| ybuffer[3] = 0; | |||
| ybuffer[4] = 0; | |||
| ybuffer[5] = 0; | |||
| ybuffer[6] = 0; | |||
| ybuffer[7] = 0; | |||
| sgemv_kernel_4x8(NB, lda, a_ptr, xbuffer, ybuffer, alpha); | |||
| *y_ptr += ybuffer[0]; | |||
| y_ptr += inc_y; | |||
| *y_ptr += ybuffer[1]; | |||
| y_ptr += inc_y; | |||
| *y_ptr += ybuffer[2]; | |||
| y_ptr += inc_y; | |||
| *y_ptr += ybuffer[3]; | |||
| y_ptr += inc_y; | |||
| *y_ptr += ybuffer[4]; | |||
| y_ptr += inc_y; | |||
| *y_ptr += ybuffer[5]; | |||
| y_ptr += inc_y; | |||
| *y_ptr += ybuffer[6]; | |||
| y_ptr += inc_y; | |||
| *y_ptr += ybuffer[7]; | |||
| y_ptr += inc_y; | |||
| a_ptr += lda8; | |||
| } | |||
| } | |||
| if (n2 & 4) { /* four more columns; this path goes through ybuffer for either value of inc_y */ | |||
| ybuffer[0] = 0; | |||
| ybuffer[1] = 0; | |||
| ybuffer[2] = 0; | |||
| ybuffer[3] = 0; | |||
| sgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, ybuffer, alpha); | |||
| a_ptr += lda<<2; | |||
| *y_ptr += ybuffer[0]; | |||
| y_ptr += inc_y; | |||
| *y_ptr += ybuffer[1]; | |||
| y_ptr += inc_y; | |||
| *y_ptr += ybuffer[2]; | |||
| y_ptr += inc_y; | |||
| *y_ptr += ybuffer[3]; | |||
| y_ptr += inc_y; | |||
| } | |||
| if (n2 & 2) { /* two more columns; the kernel applies inc_y itself */ | |||
| sgemv_kernel_4x2(NB, lda, a_ptr, xbuffer, y_ptr, alpha, inc_y); | |||
| a_ptr += lda << 1; | |||
| y_ptr += 2 * inc_y; | |||
| } | |||
| if (n2 & 1) { /* last single column */ | |||
| sgemv_kernel_4x1(NB, a_ptr, xbuffer, y_ptr, alpha); | |||
| a_ptr += lda; | |||
| y_ptr += inc_y; | |||
| } | |||
| a += NB; /* the next row block starts NB rows further down every column */ | |||
| x += NB * inc_x; | |||
| } | |||
| if (m3 == 0) return (0); /* below: scalar handling of the last m3 rows; a and x now address the first leftover row */ | |||
| x_ptr = x; | |||
| a_ptr = a; | |||
| if (m3 == 3) { | |||
| FLOAT xtemp0 = *x_ptr * alpha; /* the leftover x values are pre-scaled by alpha */ | |||
| x_ptr += inc_x; | |||
| FLOAT xtemp1 = *x_ptr * alpha; | |||
| x_ptr += inc_x; | |||
| FLOAT xtemp2 = *x_ptr * alpha; | |||
| FLOAT *aj = a_ptr; | |||
| y_ptr = y; | |||
| if (lda == 3 && inc_y == 1) { /* columns are packed back to back, so four columns span 12 consecutive elements */ | |||
| for (j = 0; j < (n & -4); j += 4) { | |||
| y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2; | |||
| y_ptr[j + 1] += aj[3] * xtemp0 + aj[4] * xtemp1 + aj[5] * xtemp2; | |||
| y_ptr[j + 2] += aj[6] * xtemp0 + aj[7] * xtemp1 + aj[8] * xtemp2; | |||
| y_ptr[j + 3] += aj[9] * xtemp0 + aj[10] * xtemp1 + aj[11] * xtemp2; | |||
| aj += 12; | |||
| } | |||
| for (; j < n; j++) { | |||
| y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2; | |||
| aj += 3; | |||
| } | |||
| } else { | |||
| if (inc_y == 1) { /* general lda with contiguous y: four columns per iteration */ | |||
| BLASLONG register lda2 = lda << 1; | |||
| BLASLONG register lda4 = lda << 2; | |||
| BLASLONG register lda3 = lda2 + lda; | |||
| for (j = 0; j < (n & -4); j += 4) { | |||
| y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; | |||
| y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1 + *(aj + lda + 2) * xtemp2; | |||
| y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1 + *(aj + lda2 + 2) * xtemp2; | |||
| y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1 + *(aj + lda3 + 2) * xtemp2; | |||
| aj += lda4; | |||
| } | |||
| for (; j < n; j++) { | |||
| y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; | |||
| aj += lda; | |||
| } | |||
| } else { /* strided y: one column per iteration */ | |||
| for (j = 0; j < n; j++) { | |||
| *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; | |||
| y_ptr += inc_y; | |||
| aj += lda; | |||
| } | |||
| } | |||
| } | |||
| return (0); | |||
| } | |||
| if (m3 == 2) { | |||
| FLOAT xtemp0 = *x_ptr * alpha; | |||
| x_ptr += inc_x; | |||
| FLOAT xtemp1 = *x_ptr * alpha; | |||
| FLOAT *aj = a_ptr; | |||
| y_ptr = y; | |||
| if (lda == 2 && inc_y == 1) { /* packed columns: four columns span 8 consecutive elements */ | |||
| for (j = 0; j < (n & -4); j += 4) { | |||
| y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1; | |||
| y_ptr[j + 1] += aj[2] * xtemp0 + aj[3] * xtemp1; | |||
| y_ptr[j + 2] += aj[4] * xtemp0 + aj[5] * xtemp1; | |||
| y_ptr[j + 3] += aj[6] * xtemp0 + aj[7] * xtemp1; | |||
| aj += 8; | |||
| } | |||
| for (; j < n; j++) { | |||
| y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1; | |||
| aj += 2; | |||
| } | |||
| } else { | |||
| if (inc_y == 1) { | |||
| BLASLONG register lda2 = lda << 1; | |||
| BLASLONG register lda4 = lda << 2; | |||
| BLASLONG register lda3 = lda2 + lda; | |||
| for (j = 0; j < (n & -4); j += 4) { | |||
| y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1; | |||
| y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1; | |||
| y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1; | |||
| y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1; | |||
| aj += lda4; | |||
| } | |||
| for (; j < n; j++) { | |||
| y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1; | |||
| aj += lda; | |||
| } | |||
| } else { | |||
| for (j = 0; j < n; j++) { | |||
| *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1; | |||
| y_ptr += inc_y; | |||
| aj += lda; | |||
| } | |||
| } | |||
| } | |||
| return (0); | |||
| } | |||
| FLOAT xtemp = *x_ptr * alpha; /* only m3 == 1 reaches this point; the other values returned above */ | |||
| FLOAT *aj = a_ptr; | |||
| y_ptr = y; | |||
| if (lda == 1 && inc_y == 1) { /* the single row is contiguous, so aj can be indexed directly by j */ | |||
| for (j = 0; j < (n & -4); j += 4) { | |||
| y_ptr[j] += aj[j] * xtemp; | |||
| y_ptr[j + 1] += aj[j + 1] * xtemp; | |||
| y_ptr[j + 2] += aj[j + 2] * xtemp; | |||
| y_ptr[j + 3] += aj[j + 3] * xtemp; | |||
| } | |||
| for (; j < n; j++) { | |||
| y_ptr[j] += aj[j] * xtemp; | |||
| } | |||
| } else { | |||
| if (inc_y == 1) { | |||
| BLASLONG register lda2 = lda << 1; | |||
| BLASLONG register lda4 = lda << 2; | |||
| BLASLONG register lda3 = lda2 + lda; | |||
| for (j = 0; j < (n & -4); j += 4) { | |||
| y_ptr[j] += *aj * xtemp; | |||
| y_ptr[j + 1] += *(aj + lda) * xtemp; | |||
| y_ptr[j + 2] += *(aj + lda2) * xtemp; | |||
| y_ptr[j + 3] += *(aj + lda3) * xtemp; | |||
| aj += lda4; | |||
| } | |||
| for (; j < n; j++) { | |||
| y_ptr[j] += *aj * xtemp; | |||
| aj += lda; | |||
| } | |||
| } else { | |||
| for (j = 0; j < n; j++) { | |||
| *y_ptr += *aj * xtemp; | |||
| y_ptr += inc_y; | |||
| aj += lda; | |||
| } | |||
| } | |||
| } | |||
| return (0); | |||
| } | |||
| #endif | |||
| /*************************************************************************** | |||
| Copyright (c) 2019, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #if !defined(__VEC__) || !defined(__ALTIVEC__) | |||
| #include "../arm/gemv_t.c" | |||
| #else | |||
| #include "common.h" | |||
| #define NBMAX 2048 | |||
| #include <altivec.h> | |||
| static void sgemv_kernel_4x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { /* For k in 0..7: y[k] += alpha * sum_i x[i] * ap[k*lda + i], taken over the first 4*(n/4) values of i. */ | |||
| BLASLONG i; | |||
| FLOAT *a0, *a1, *a2, *a3, *a4, *a5, *a6, *a7; | |||
| __vector float *va0, *va1, *va2, *va3, *va4, *va5, *va6, *va7, *v_x; | |||
| register __vector float temp0 = {0,0,0,0}; /* temp0..temp7: four lane-wise partial sums per column */ | |||
| register __vector float temp1 = {0,0,0,0}; | |||
| register __vector float temp2 = {0,0,0,0}; | |||
| register __vector float temp3 = {0,0,0,0}; | |||
| register __vector float temp4 = {0,0,0,0}; | |||
| register __vector float temp5 = {0,0,0,0}; | |||
| register __vector float temp6 = {0,0,0,0}; | |||
| register __vector float temp7 = {0,0,0,0}; | |||
| a0 = ap; /* a0..a7: eight consecutive columns, lda elements apart */ | |||
| a1 = ap + lda; | |||
| a2 = a1 + lda; | |||
| a3 = a2 + lda; | |||
| a4 = a3 + lda; | |||
| a5 = a4 + lda; | |||
| a6 = a5 + lda; | |||
| a7 = a6 + lda; | |||
| va0 = (__vector float*) a0; /* NOTE(review): scalar pointers are reinterpreted as 4-float vector pointers; alignment expectations depend on the target - confirm */ | |||
| va1 = (__vector float*) a1; | |||
| va2 = (__vector float*) a2; | |||
| va3 = (__vector float*) a3; | |||
| va4 = (__vector float*) a4; | |||
| va5 = (__vector float*) a5; | |||
| va6 = (__vector float*) a6; | |||
| va7 = (__vector float*) a7; | |||
| v_x = (__vector float*) x; | |||
| for (i = 0; i < n/4; i ++) { /* an n%4 remainder is not processed here */ | |||
| temp0 += v_x[i] * va0[i]; | |||
| temp1 += v_x[i] * va1[i]; | |||
| temp2 += v_x[i] * va2[i]; | |||
| temp3 += v_x[i] * va3[i]; | |||
| temp4 += v_x[i] * va4[i]; | |||
| temp5 += v_x[i] * va5[i]; | |||
| temp6 += v_x[i] * va6[i]; | |||
| temp7 += v_x[i] * va7[i]; | |||
| } | |||
| y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); /* sum the four lanes, scale by alpha, accumulate into y */ | |||
| y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); | |||
| y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]); | |||
| y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]); | |||
| y[4] += alpha * (temp4[0] + temp4[1]+temp4[2] + temp4[3]); | |||
| y[5] += alpha * (temp5[0] + temp5[1]+temp5[2] + temp5[3]); | |||
| y[6] += alpha * (temp6[0] + temp6[1]+temp6[2] + temp6[3]); | |||
| y[7] += alpha * (temp7[0] + temp7[1]+temp7[2] + temp7[3]); | |||
| } | |||
| static void sgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { /* For k in 0..3: y[k] += alpha * sum_i x[i] * ap[k*lda + i], taken over the first 4*(n/4) values of i. */ | |||
| BLASLONG i = 0; | |||
| FLOAT *a0, *a1, *a2, *a3; | |||
| a0 = ap; /* a0..a3: four consecutive columns, lda elements apart */ | |||
| a1 = ap + lda; | |||
| a2 = a1 + lda; | |||
| a3 = a2 + lda; | |||
| __vector float* va0 = (__vector float*) a0; /* NOTE(review): vector access through cast scalar pointers; alignment expectations depend on the target - confirm */ | |||
| __vector float* va1 = (__vector float*) a1; | |||
| __vector float* va2 = (__vector float*) a2; | |||
| __vector float* va3 = (__vector float*) a3; | |||
| __vector float* v_x = (__vector float*) x; | |||
| register __vector float temp0 = {0,0,0,0}; /* four lane-wise partial sums per column */ | |||
| register __vector float temp1 = {0,0,0,0}; | |||
| register __vector float temp2 = {0,0,0,0}; | |||
| register __vector float temp3 = {0,0,0,0}; | |||
| for (i = 0; i < n / 4; i ++) { /* an n%4 remainder is not processed here */ | |||
| temp0 += v_x[i] * va0[i]; | |||
| temp1 += v_x[i] * va1[i]; | |||
| temp2 += v_x[i] * va2[i]; | |||
| temp3 += v_x[i] * va3[i]; | |||
| } | |||
| y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); /* sum the four lanes, scale by alpha, accumulate into y */ | |||
| y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); | |||
| y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]); | |||
| y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]); | |||
| } | |||
| static void sgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha, BLASLONG inc_y) { | |||
| BLASLONG i; | |||
| FLOAT *a0, *a1; | |||
| a0 = ap; | |||
| a1 = ap + lda; | |||
| __vector float* va0 = (__vector float*) a0; | |||
| __vector float* va1 = (__vector float*) a1; | |||
| __vector float* v_x = (__vector float*) x; | |||
| __vector float temp0 = {0,0,0,0}; | |||
| __vector float temp1 = {0,0,0,0}; | |||
| for (i = 0; i < n / 4; i ++) { | |||
| temp0 += v_x[i] * va0[i]; | |||
| temp1 += v_x[i] * va1[i]; | |||
| } | |||
| y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); | |||
| y[inc_y] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); | |||
| } | |||
| static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { | |||
| BLASLONG i; | |||
| FLOAT *a0; | |||
| a0 = ap; | |||
| __vector float* va0 = (__vector float*) a0; | |||
| __vector float* v_x = (__vector float*) x; | |||
| __vector float temp0 = {0,0,0,0}; | |||
| for (i = 0; i < n / 4; i ++) { | |||
| temp0 += v_x[i] * va0[i] ; | |||
| } | |||
| y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); | |||
| } | |||
| static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { | |||
| BLASLONG i; | |||
| for (i = 0; i < n; i++) { | |||
| *dest++ = *src; | |||
| src += inc_src; | |||
| } | |||
| } | |||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) { | |||
| BLASLONG i; | |||
| BLASLONG j; | |||
| FLOAT *a_ptr; | |||
| FLOAT *x_ptr; | |||
| FLOAT *y_ptr; | |||
| BLASLONG n1; | |||
| BLASLONG m1; | |||
| BLASLONG m2; | |||
| BLASLONG m3; | |||
| BLASLONG n2; | |||
| FLOAT ybuffer[8] __attribute__((aligned(16))); | |||
| FLOAT *xbuffer; | |||
| if (m < 1) return (0); | |||
| if (n < 1) return (0); | |||
| xbuffer = buffer; | |||
| n1 = n >> 3; | |||
| n2 = n & 7; | |||
| m3 = m & 3; | |||
| m1 = m - m3; | |||
| m2 = (m & (NBMAX - 1)) - m3; | |||
| BLASLONG NB = NBMAX; | |||
| while (NB == NBMAX) { | |||
| m1 -= NB; | |||
| if (m1 < 0) { | |||
| if (m2 == 0) break; | |||
| NB = m2; | |||
| } | |||
| y_ptr = y; | |||
| a_ptr = a; | |||
| x_ptr = x; | |||
| if (inc_x != 1) | |||
| copy_x(NB, x_ptr, xbuffer, inc_x); | |||
| else | |||
| xbuffer = x_ptr; | |||
| BLASLONG lda8 = lda << 3; | |||
| if (inc_y == 1) { | |||
| for (i = 0; i < n1; i++) { | |||
| sgemv_kernel_4x8(NB, lda, a_ptr, xbuffer, y_ptr, alpha); | |||
| y_ptr += 8; | |||
| a_ptr += lda8; | |||
| } | |||
| } else { | |||
| for (i = 0; i < n1; i++) { | |||
| ybuffer[0] = 0; | |||
| ybuffer[1] = 0; | |||
| ybuffer[2] = 0; | |||
| ybuffer[3] = 0; | |||
| ybuffer[4] = 0; | |||
| ybuffer[5] = 0; | |||
| ybuffer[6] = 0; | |||
| ybuffer[7] = 0; | |||
| sgemv_kernel_4x8(NB, lda, a_ptr, xbuffer, ybuffer, alpha); | |||
| *y_ptr += ybuffer[0]; | |||
| y_ptr += inc_y; | |||
| *y_ptr += ybuffer[1]; | |||
| y_ptr += inc_y; | |||
| *y_ptr += ybuffer[2]; | |||
| y_ptr += inc_y; | |||
| *y_ptr += ybuffer[3]; | |||
| y_ptr += inc_y; | |||
| *y_ptr += ybuffer[4]; | |||
| y_ptr += inc_y; | |||
| *y_ptr += ybuffer[5]; | |||
| y_ptr += inc_y; | |||
| *y_ptr += ybuffer[6]; | |||
| y_ptr += inc_y; | |||
| *y_ptr += ybuffer[7]; | |||
| y_ptr += inc_y; | |||
| a_ptr += lda8; | |||
| } | |||
| } | |||
| if (n2 & 4) { | |||
| ybuffer[0] = 0; | |||
| ybuffer[1] = 0; | |||
| ybuffer[2] = 0; | |||
| ybuffer[3] = 0; | |||
| sgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, ybuffer, alpha); | |||
| a_ptr += lda<<2; | |||
| *y_ptr += ybuffer[0]; | |||
| y_ptr += inc_y; | |||
| *y_ptr += ybuffer[1]; | |||
| y_ptr += inc_y; | |||
| *y_ptr += ybuffer[2]; | |||
| y_ptr += inc_y; | |||
| *y_ptr += ybuffer[3]; | |||
| y_ptr += inc_y; | |||
| } | |||
| if (n2 & 2) { | |||
| sgemv_kernel_4x2(NB, lda, a_ptr, xbuffer, y_ptr, alpha, inc_y); | |||
| a_ptr += lda << 1; | |||
| y_ptr += 2 * inc_y; | |||
| } | |||
| if (n2 & 1) { | |||
| sgemv_kernel_4x1(NB, a_ptr, xbuffer, y_ptr, alpha); | |||
| a_ptr += lda; | |||
| y_ptr += inc_y; | |||
| } | |||
| a += NB; | |||
| x += NB * inc_x; | |||
| } | |||
| if (m3 == 0) return (0); | |||
| x_ptr = x; | |||
| a_ptr = a; | |||
| if (m3 == 3) { | |||
| FLOAT xtemp0 = *x_ptr * alpha; | |||
| x_ptr += inc_x; | |||
| FLOAT xtemp1 = *x_ptr * alpha; | |||
| x_ptr += inc_x; | |||
| FLOAT xtemp2 = *x_ptr * alpha; | |||
| FLOAT *aj = a_ptr; | |||
| y_ptr = y; | |||
| if (lda == 3 && inc_y == 1) { | |||
| for (j = 0; j < (n & -4); j += 4) { | |||
| y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2; | |||
| y_ptr[j + 1] += aj[3] * xtemp0 + aj[4] * xtemp1 + aj[5] * xtemp2; | |||
| y_ptr[j + 2] += aj[6] * xtemp0 + aj[7] * xtemp1 + aj[8] * xtemp2; | |||
| y_ptr[j + 3] += aj[9] * xtemp0 + aj[10] * xtemp1 + aj[11] * xtemp2; | |||
| aj += 12; | |||
| } | |||
| for (; j < n; j++) { | |||
| y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2; | |||
| aj += 3; | |||
| } | |||
| } else { | |||
| if (inc_y == 1) { | |||
| BLASLONG register lda2 = lda << 1; | |||
| BLASLONG register lda4 = lda << 2; | |||
| BLASLONG register lda3 = lda2 + lda; | |||
| for (j = 0; j < (n & -4); j += 4) { | |||
| y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; | |||
| y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1 + *(aj + lda + 2) * xtemp2; | |||
| y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1 + *(aj + lda2 + 2) * xtemp2; | |||
| y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1 + *(aj + lda3 + 2) * xtemp2; | |||
| aj += lda4; | |||
| } | |||
| for (; j < n; j++) { | |||
| y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; | |||
| aj += lda; | |||
| } | |||
| } else { | |||
| for (j = 0; j < n; j++) { | |||
| *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; | |||
| y_ptr += inc_y; | |||
| aj += lda; | |||
| } | |||
| } | |||
| } | |||
| return (0); | |||
| } | |||
| if (m3 == 2) { | |||
| FLOAT xtemp0 = *x_ptr * alpha; | |||
| x_ptr += inc_x; | |||
| FLOAT xtemp1 = *x_ptr * alpha; | |||
| FLOAT *aj = a_ptr; | |||
| y_ptr = y; | |||
| if (lda == 2 && inc_y == 1) { | |||
| for (j = 0; j < (n & -4); j += 4) { | |||
| y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1; | |||
| y_ptr[j + 1] += aj[2] * xtemp0 + aj[3] * xtemp1; | |||
| y_ptr[j + 2] += aj[4] * xtemp0 + aj[5] * xtemp1; | |||
| y_ptr[j + 3] += aj[6] * xtemp0 + aj[7] * xtemp1; | |||
| aj += 8; | |||
| } | |||
| for (; j < n; j++) { | |||
| y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1; | |||
| aj += 2; | |||
| } | |||
| } else { | |||
| if (inc_y == 1) { | |||
| BLASLONG register lda2 = lda << 1; | |||
| BLASLONG register lda4 = lda << 2; | |||
| BLASLONG register lda3 = lda2 + lda; | |||
| for (j = 0; j < (n & -4); j += 4) { | |||
| y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1; | |||
| y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1; | |||
| y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1; | |||
| y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1; | |||
| aj += lda4; | |||
| } | |||
| for (; j < n; j++) { | |||
| y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1; | |||
| aj += lda; | |||
| } | |||
| } else { | |||
| for (j = 0; j < n; j++) { | |||
| *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1; | |||
| y_ptr += inc_y; | |||
| aj += lda; | |||
| } | |||
| } | |||
| } | |||
| return (0); | |||
| } | |||
| FLOAT xtemp = *x_ptr * alpha; | |||
| FLOAT *aj = a_ptr; | |||
| y_ptr = y; | |||
| if (lda == 1 && inc_y == 1) { | |||
| for (j = 0; j < (n & -4); j += 4) { | |||
| y_ptr[j] += aj[j] * xtemp; | |||
| y_ptr[j + 1] += aj[j + 1] * xtemp; | |||
| y_ptr[j + 2] += aj[j + 2] * xtemp; | |||
| y_ptr[j + 3] += aj[j + 3] * xtemp; | |||
| } | |||
| for (; j < n; j++) { | |||
| y_ptr[j] += aj[j] * xtemp; | |||
| } | |||
| } else { | |||
| if (inc_y == 1) { | |||
| BLASLONG register lda2 = lda << 1; | |||
| BLASLONG register lda4 = lda << 2; | |||
| BLASLONG register lda3 = lda2 + lda; | |||
| for (j = 0; j < (n & -4); j += 4) { | |||
| y_ptr[j] += *aj * xtemp; | |||
| y_ptr[j + 1] += *(aj + lda) * xtemp; | |||
| y_ptr[j + 2] += *(aj + lda2) * xtemp; | |||
| y_ptr[j + 3] += *(aj + lda3) * xtemp; | |||
| aj += lda4; | |||
| } | |||
| for (; j < n; j++) { | |||
| y_ptr[j] += *aj * xtemp; | |||
| aj += lda; | |||
| } | |||
| } else { | |||
| for (j = 0; j < n; j++) { | |||
| *y_ptr += *aj * xtemp; | |||
| y_ptr += inc_y; | |||
| aj += lda; | |||
| } | |||
| } | |||
| } | |||
| return (0); | |||
| } | |||
| #endif | |||
| @@ -1,245 +1,245 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2019, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| #include "def_vsx.h" | |||
/*
 * Register map, prologue and epilogue wrapped around the code pulled in from
 * zgemm_macros_power9.S and zgemm_logic_power9.S.
 */
| #define LOAD ld | |||
/* Size of the frame opened below the incoming SP by the prologue. */
| #define STACKSIZE 512 | |||
/* Defined here but not referenced in this part of the file. */
| #define FZERO 312+192(SP) | |||
/* Offset (from the new SP) of the slot that receives r0, i.e. the saved LR. */
| #define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */ | |||
/*
 * Aliases for the incoming argument registers.
 * NOTE(review): presumably the usual GEMM kernel arguments (dimensions,
 * matrices, leading dimension, TRMM offset) - confirm against the caller.
 */
| #define M r3 | |||
| #define N r4 | |||
| #define K r5 | |||
| #define A r8 | |||
| #define B r9 | |||
| #define C r10 | |||
| #define LDC r6 | |||
| #define OFFSET r7 | |||
| #define o0 0 | |||
/* alpha is kept in vs30/vs31; both are filled by xxspltd in the prologue. */
| #define alpha_r vs30 | |||
| #define alpha_i vs31 | |||
| #define VECSAVE r11 | |||
/* Holds the incoming SP (see the mr at the start of the prologue). */
| #define FRAMEPOINTER r12 | |||
/* Working registers; all of them lie in r14-r31, which the prologue saves. */
| #define T10 r14 | |||
| #define L r15 | |||
| #define T8 r16 | |||
| #define T5 r17 | |||
| #define T2 r19 | |||
| #define TEMP_REG r20 | |||
| #define T6 r21 | |||
| #define I r22 | |||
| #define J r23 | |||
| #define AO r24 | |||
| #define BO r25 | |||
| #define CO r26 | |||
| #define T7 r27 | |||
| #define T3 r28 | |||
| #define T4 r29 | |||
| #define PRE r30 | |||
| #define T1 r31 | |||
| #ifndef NEEDPARAM | |||
| PROLOGUE | |||
| PROFCODE | |||
/* Remember the incoming SP, then open a STACKSIZE-byte frame. */
| mr FRAMEPOINTER, SP | |||
| addi SP, SP, -STACKSIZE | |||
/* r0 <- LR; it is stored to FLINK_SAVE(SP) after the register saves. */
| mflr r0 | |||
/* Save f14-f31 at offsets 0..136. */
| stfd f14, 0(SP) | |||
| stfd f15, 8(SP) | |||
| stfd f16, 16(SP) | |||
| stfd f17, 24(SP) | |||
| stfd f18, 32(SP) | |||
| stfd f19, 40(SP) | |||
| stfd f20, 48(SP) | |||
| stfd f21, 56(SP) | |||
| stfd f22, 64(SP) | |||
| stfd f23, 72(SP) | |||
| stfd f24, 80(SP) | |||
| stfd f25, 88(SP) | |||
| stfd f26, 96(SP) | |||
| stfd f27, 104(SP) | |||
| stfd f28, 112(SP) | |||
| stfd f29, 120(SP) | |||
| stfd f30, 128(SP) | |||
| stfd f31, 136(SP) | |||
/*
 * NOTE(review): vs30/vs31 presumably overlap f30/f31, which would be why
 * alpha is written only after those two were stored - confirm against the
 * Power ISA.
 */
| xxspltd alpha_r,vs1,0 /*copy from register f1 */ | |||
| xxspltd alpha_i,vs2,0 /*copy from register f2 */ | |||
/* Save r14-r31 at offsets 144..280. */
| std r31, 144(SP) | |||
| std r30, 152(SP) | |||
| std r29, 160(SP) | |||
| std r28, 168(SP) | |||
| std r27, 176(SP) | |||
| std r26, 184(SP) | |||
| std r25, 192(SP) | |||
| std r24, 200(SP) | |||
| std r23, 208(SP) | |||
| std r22, 216(SP) | |||
| std r21, 224(SP) | |||
| std r20, 232(SP) | |||
| std r19, 240(SP) | |||
| std r18, 248(SP) | |||
| std r17, 256(SP) | |||
| std r16, 264(SP) | |||
| std r15, 272(SP) | |||
| std r14, 280(SP) | |||
/* Save vs52-vs63 at offsets 288..464, 16 bytes apart. */
| stxv vs52, 288(SP) | |||
| stxv vs53, 304(SP) | |||
| stxv vs54, 320(SP) | |||
| stxv vs55, 336(SP) | |||
| stxv vs56, 352(SP) | |||
| stxv vs57, 368(SP) | |||
| stxv vs58, 384(SP) | |||
| stxv vs59, 400(SP) | |||
| stxv vs60, 416(SP) | |||
| stxv vs61, 432(SP) | |||
| stxv vs62, 448(SP) | |||
| stxv vs63, 464(SP) | |||
/* Store the LR value captured by mflr above. */
| std r0, FLINK_SAVE(SP) | |||
/* LDC is read through FRAMEPOINTER, i.e. relative to the incoming SP. */
/* FRAMESLOT is defined outside this part of the file. */
| #if defined(linux) || defined(__FreeBSD__) | |||
| ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER) | |||
| #endif | |||
/* TRMM builds read OFFSET from the next slot in the same way. */
| #ifdef TRMMKERNEL | |||
| #if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) | |||
| ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER) | |||
| #endif | |||
| #endif | |||
| #include "zgemm_macros_power9.S" | |||
/* Shift LDC left by ZBASE_SHIFT (defined elsewhere). */
/* NOTE(review): presumably converts an element count to bytes - confirm. */
| slwi LDC, LDC, ZBASE_SHIFT | |||
| li PRE, 512 | |||
| li r0, 0 | |||
| #if defined(CC) || defined(CR) || defined(RC) || defined(RR) | |||
| /*negate for this case as we will use addition -1*(a+b) */ | |||
| xvnegdp alpha_r,alpha_r | |||
| xvnegdp alpha_i,alpha_i | |||
| #endif | |||
| .align 4 | |||
| #include "zgemm_logic_power9.S" | |||
/* Exit path: reload every register saved by the prologue from its slot. */
| L999: | |||
| lfd f14, 0(SP) | |||
| lfd f15, 8(SP) | |||
| lfd f16, 16(SP) | |||
| lfd f17, 24(SP) | |||
| lfd f18, 32(SP) | |||
| lfd f19, 40(SP) | |||
| lfd f20, 48(SP) | |||
| lfd f21, 56(SP) | |||
| lfd f22, 64(SP) | |||
| lfd f23, 72(SP) | |||
| lfd f24, 80(SP) | |||
| lfd f25, 88(SP) | |||
| lfd f26, 96(SP) | |||
| lfd f27, 104(SP) | |||
| lfd f28, 112(SP) | |||
| lfd f29, 120(SP) | |||
| lfd f30, 128(SP) | |||
| lfd f31, 136(SP) | |||
| ld r31, 144(SP) | |||
| ld r30, 152(SP) | |||
| ld r29, 160(SP) | |||
| ld r28, 168(SP) | |||
| ld r27, 176(SP) | |||
| ld r26, 184(SP) | |||
| ld r25, 192(SP) | |||
| ld r24, 200(SP) | |||
| ld r23, 208(SP) | |||
| ld r22, 216(SP) | |||
| ld r21, 224(SP) | |||
| ld r20, 232(SP) | |||
| ld r19, 240(SP) | |||
| ld r18, 248(SP) | |||
| ld r17, 256(SP) | |||
| ld r16, 264(SP) | |||
| ld r15, 272(SP) | |||
| ld r14, 280(SP) | |||
/* Reload the saved LR value into r0; mtlr below moves it back into LR. */
| ld r0, FLINK_SAVE(SP) | |||
| lxv vs52, 288(SP) | |||
| lxv vs53, 304(SP) | |||
| lxv vs54, 320(SP) | |||
| lxv vs55, 336(SP) | |||
| lxv vs56, 352(SP) | |||
| lxv vs57, 368(SP) | |||
| lxv vs58, 384(SP) | |||
| lxv vs59, 400(SP) | |||
| mtlr r0 | |||
| lxv vs60, 416(SP) | |||
| lxv vs61, 432(SP) | |||
| lxv vs62, 448(SP) | |||
| lxv vs63, 464(SP) | |||
/* Release the frame and return. */
| addi SP, SP, STACKSIZE | |||
| blr | |||
| EPILOGUE | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2019, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| #include "def_vsx.h" | |||
/*
 * Register map, prologue and epilogue wrapped around the code pulled in from
 * zgemm_macros_power9.S and zgemm_logic_power9.S.
 */
| #define LOAD ld | |||
/* Size of the frame opened below the incoming SP by the prologue. */
| #define STACKSIZE 512 | |||
/* Defined here but not referenced in this part of the file. */
| #define FZERO 312+192(SP) | |||
/* Offset (from the new SP) of the slot that receives r0, i.e. the saved LR. */
| #define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */ | |||
/*
 * Aliases for the incoming argument registers.
 * NOTE(review): presumably the usual GEMM kernel arguments (dimensions,
 * matrices, leading dimension, TRMM offset) - confirm against the caller.
 */
| #define M r3 | |||
| #define N r4 | |||
| #define K r5 | |||
| #define A r8 | |||
| #define B r9 | |||
| #define C r10 | |||
| #define LDC r6 | |||
| #define OFFSET r7 | |||
| #define o0 0 | |||
/* alpha is kept in vs30/vs31; both are filled by xxspltd in the prologue. */
| #define alpha_r vs30 | |||
| #define alpha_i vs31 | |||
| #define VECSAVE r11 | |||
/* Holds the incoming SP (see the mr at the start of the prologue). */
| #define FRAMEPOINTER r12 | |||
/* Working registers; all of them lie in r14-r31, which the prologue saves. */
| #define T10 r14 | |||
| #define L r15 | |||
| #define T8 r16 | |||
| #define T5 r17 | |||
| #define T2 r19 | |||
| #define TEMP_REG r20 | |||
| #define T6 r21 | |||
| #define I r22 | |||
| #define J r23 | |||
| #define AO r24 | |||
| #define BO r25 | |||
| #define CO r26 | |||
| #define T7 r27 | |||
| #define T3 r28 | |||
| #define T4 r29 | |||
| #define PRE r30 | |||
| #define T1 r31 | |||
| #ifndef NEEDPARAM | |||
| PROLOGUE | |||
| PROFCODE | |||
/* Remember the incoming SP, then open a STACKSIZE-byte frame. */
| mr FRAMEPOINTER, SP | |||
| addi SP, SP, -STACKSIZE | |||
/* r0 <- LR; it is stored to FLINK_SAVE(SP) after the register saves. */
| mflr r0 | |||
/* Save f14-f31 at offsets 0..136. */
| stfd f14, 0(SP) | |||
| stfd f15, 8(SP) | |||
| stfd f16, 16(SP) | |||
| stfd f17, 24(SP) | |||
| stfd f18, 32(SP) | |||
| stfd f19, 40(SP) | |||
| stfd f20, 48(SP) | |||
| stfd f21, 56(SP) | |||
| stfd f22, 64(SP) | |||
| stfd f23, 72(SP) | |||
| stfd f24, 80(SP) | |||
| stfd f25, 88(SP) | |||
| stfd f26, 96(SP) | |||
| stfd f27, 104(SP) | |||
| stfd f28, 112(SP) | |||
| stfd f29, 120(SP) | |||
| stfd f30, 128(SP) | |||
| stfd f31, 136(SP) | |||
/*
 * NOTE(review): vs30/vs31 presumably overlap f30/f31, which would be why
 * alpha is written only after those two were stored - confirm against the
 * Power ISA.
 */
| xxspltd alpha_r,vs1,0 /*copy from register f1 */ | |||
| xxspltd alpha_i,vs2,0 /*copy from register f2 */ | |||
/* Save r14-r31 at offsets 144..280. */
| std r31, 144(SP) | |||
| std r30, 152(SP) | |||
| std r29, 160(SP) | |||
| std r28, 168(SP) | |||
| std r27, 176(SP) | |||
| std r26, 184(SP) | |||
| std r25, 192(SP) | |||
| std r24, 200(SP) | |||
| std r23, 208(SP) | |||
| std r22, 216(SP) | |||
| std r21, 224(SP) | |||
| std r20, 232(SP) | |||
| std r19, 240(SP) | |||
| std r18, 248(SP) | |||
| std r17, 256(SP) | |||
| std r16, 264(SP) | |||
| std r15, 272(SP) | |||
| std r14, 280(SP) | |||
/* Save vs52-vs63 at offsets 288..464, 16 bytes apart. */
| stxv vs52, 288(SP) | |||
| stxv vs53, 304(SP) | |||
| stxv vs54, 320(SP) | |||
| stxv vs55, 336(SP) | |||
| stxv vs56, 352(SP) | |||
| stxv vs57, 368(SP) | |||
| stxv vs58, 384(SP) | |||
| stxv vs59, 400(SP) | |||
| stxv vs60, 416(SP) | |||
| stxv vs61, 432(SP) | |||
| stxv vs62, 448(SP) | |||
| stxv vs63, 464(SP) | |||
/* Store the LR value captured by mflr above. */
| std r0, FLINK_SAVE(SP) | |||
/* LDC is read through FRAMEPOINTER, i.e. relative to the incoming SP. */
/* FRAMESLOT is defined outside this part of the file. */
| #if defined(linux) || defined(__FreeBSD__) | |||
| ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER) | |||
| #endif | |||
/* TRMM builds read OFFSET from the next slot in the same way. */
| #ifdef TRMMKERNEL | |||
| #if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) | |||
| ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER) | |||
| #endif | |||
| #endif | |||
| #include "zgemm_macros_power9.S" | |||
/* Shift LDC left by ZBASE_SHIFT (defined elsewhere). */
/* NOTE(review): presumably converts an element count to bytes - confirm. */
| slwi LDC, LDC, ZBASE_SHIFT | |||
| li PRE, 512 | |||
| li r0, 0 | |||
| #if defined(CC) || defined(CR) || defined(RC) || defined(RR) | |||
| /*negate for this case as we will use addition -1*(a+b) */ | |||
| xvnegdp alpha_r,alpha_r | |||
| xvnegdp alpha_i,alpha_i | |||
| #endif | |||
| .align 4 | |||
| #include "zgemm_logic_power9.S" | |||
/* Exit path: reload every register saved by the prologue from its slot. */
| L999: | |||
| lfd f14, 0(SP) | |||
| lfd f15, 8(SP) | |||
| lfd f16, 16(SP) | |||
| lfd f17, 24(SP) | |||
| lfd f18, 32(SP) | |||
| lfd f19, 40(SP) | |||
| lfd f20, 48(SP) | |||
| lfd f21, 56(SP) | |||
| lfd f22, 64(SP) | |||
| lfd f23, 72(SP) | |||
| lfd f24, 80(SP) | |||
| lfd f25, 88(SP) | |||
| lfd f26, 96(SP) | |||
| lfd f27, 104(SP) | |||
| lfd f28, 112(SP) | |||
| lfd f29, 120(SP) | |||
| lfd f30, 128(SP) | |||
| lfd f31, 136(SP) | |||
| ld r31, 144(SP) | |||
| ld r30, 152(SP) | |||
| ld r29, 160(SP) | |||
| ld r28, 168(SP) | |||
| ld r27, 176(SP) | |||
| ld r26, 184(SP) | |||
| ld r25, 192(SP) | |||
| ld r24, 200(SP) | |||
| ld r23, 208(SP) | |||
| ld r22, 216(SP) | |||
| ld r21, 224(SP) | |||
| ld r20, 232(SP) | |||
| ld r19, 240(SP) | |||
| ld r18, 248(SP) | |||
| ld r17, 256(SP) | |||
| ld r16, 264(SP) | |||
| ld r15, 272(SP) | |||
| ld r14, 280(SP) | |||
/* Reload the saved LR value into r0; mtlr below moves it back into LR. */
| ld r0, FLINK_SAVE(SP) | |||
| lxv vs52, 288(SP) | |||
| lxv vs53, 304(SP) | |||
| lxv vs54, 320(SP) | |||
| lxv vs55, 336(SP) | |||
| lxv vs56, 352(SP) | |||
| lxv vs57, 368(SP) | |||
| lxv vs58, 384(SP) | |||
| lxv vs59, 400(SP) | |||
| mtlr r0 | |||
| lxv vs60, 416(SP) | |||
| lxv vs61, 432(SP) | |||
| lxv vs62, 448(SP) | |||
| lxv vs63, 464(SP) | |||
/* Release the frame and return. */
| addi SP, SP, STACKSIZE | |||
| blr | |||
| EPILOGUE | |||
| #endif | |||
| @@ -62,6 +62,8 @@ gotoblas_t TABLE_NAME = { | |||
| MAX(SBGEMM_DEFAULT_UNROLL_M, SBGEMM_DEFAULT_UNROLL_N), | |||
| #endif | |||
| SBGEMM_ALIGN_K, | |||
| sbstobf16_kTS, sbdtobf16_kTS, sbf16tos_kTS, dbf16tod_kTS, | |||
| samax_kTS, samin_kTS, smax_kTS, smin_kTS, | |||
| @@ -866,7 +868,7 @@ gotoblas_t TABLE_NAME = { | |||
| cgeadd_kTS, | |||
| #endif | |||
| #if BUILD_COMPLEX16==1 | |||
| zgeadd_kTS | |||
| zgeadd_kTS, | |||
| #endif | |||
| }; | |||
| @@ -44,8 +44,5 @@ DGEMM_BETA = dgemm_beta_skylakex.c | |||
| CGEMMKERNEL = cgemm_kernel_8x2_skylakex.c | |||
| ZGEMMKERNEL = zgemm_kernel_4x2_skylakex.c | |||
| CSCALKERNEL = ../arm/zscal.c | |||
| ZSCALKERNEL = ../arm/zscal.c | |||
| CASUMKERNEL = casum.c | |||
| ZASUMKERNEL = zasum.c | |||