| @@ -30,6 +30,15 @@ task: | |||
| - cd build | |||
| - cmake -DTARGET=VORTEX -DCMAKE_C_COMPILER=clang -DBUILD_SHARED_LIBS=ON .. | |||
| - make | |||
| task: | |||
| name: AppleM1/GCC/MAKE/OPENMP | |||
| compile_script: | |||
| - brew install gcc@11 | |||
| - export PATH=/opt/homebrew/bin:$PATH | |||
| - export LDFLAGS="-L/opt/homebrew/lib" | |||
| - export CPPFLAGS="-I/opt/homebrew/include" | |||
| - make CC=gcc-11 FC=gfortran-11 USE_OPENMP=1 | |||
| macos_instance: | |||
| image: ghcr.io/cirruslabs/macos-monterey-xcode:latest | |||
| @@ -151,40 +151,53 @@ jobs: | |||
| strategy: | |||
| fail-fast: false | |||
| matrix: | |||
| msystem: [MINGW64, MINGW32, CLANG64] | |||
| msystem: [MINGW64, MINGW32, CLANG64, CLANG32] | |||
| idx: [int32, int64] | |||
| build-type: [Release] | |||
| include: | |||
| - msystem: MINGW64 | |||
| idx: int32 | |||
| target-prefix: mingw-w64-x86_64 | |||
| fc-pkg: mingw-w64-x86_64-gcc-fortran | |||
| fc-pkg: fc | |||
| - msystem: MINGW32 | |||
| idx: int32 | |||
| target-prefix: mingw-w64-i686 | |||
| fc-pkg: mingw-w64-i686-gcc-fortran | |||
| fc-pkg: fc | |||
| - msystem: CLANG64 | |||
| idx: int32 | |||
| target-prefix: mingw-w64-clang-x86_64 | |||
| fc-pkg: fc | |||
| # Compiling with Flang 16 seems to cause test errors on machines | |||
| # with AVX512 instructions. Revisit after MSYS2 distributes Flang 17. | |||
| no-avx512-flags: -DNO_AVX512=1 | |||
| - msystem: CLANG32 | |||
| idx: int32 | |||
| target-prefix: mingw-w64-clang-i686 | |||
| fc-pkg: cc | |||
| c-lapack-flags: -DC_LAPACK=ON | |||
| - msystem: MINGW64 | |||
| idx: int64 | |||
| idx64-flags: -DBINARY=64 -DINTERFACE64=1 | |||
| target-prefix: mingw-w64-x86_64 | |||
| fc-pkg: mingw-w64-x86_64-gcc-fortran | |||
| fc-pkg: fc | |||
| - msystem: CLANG64 | |||
| idx: int64 | |||
| idx64-flags: -DBINARY=64 -DINTERFACE64=1 | |||
| target-prefix: mingw-w64-clang-x86_64 | |||
| c-lapack-flags: -DC_LAPACK=ON | |||
| fc-pkg: fc | |||
| # Compiling with Flang 16 seems to cause test errors on machines | |||
| # with AVX512 instructions. Revisit after MSYS2 distributes Flang 17. | |||
| no-avx512-flags: -DNO_AVX512=1 | |||
| - msystem: MINGW64 | |||
| idx: int32 | |||
| target-prefix: mingw-w64-x86_64 | |||
| fc-pkg: mingw-w64-x86_64-gcc-fortran | |||
| fc-pkg: fc | |||
| build-type: None | |||
| exclude: | |||
| - msystem: MINGW32 | |||
| idx: int64 | |||
| - msystem: CLANG32 | |||
| idx: int64 | |||
| defaults: | |||
| run: | |||
| @@ -209,7 +222,7 @@ jobs: | |||
| install: >- | |||
| base-devel | |||
| ${{ matrix.target-prefix }}-cc | |||
| ${{ matrix.fc-pkg }} | |||
| ${{ matrix.target-prefix }}-${{ matrix.fc-pkg }} | |||
| ${{ matrix.target-prefix }}-cmake | |||
| ${{ matrix.target-prefix }}-ninja | |||
| ${{ matrix.target-prefix }}-ccache | |||
| @@ -261,6 +274,7 @@ jobs: | |||
| -DTARGET=CORE2 \ | |||
| ${{ matrix.idx64-flags }} \ | |||
| ${{ matrix.c-lapack-flags }} \ | |||
| ${{ matrix.no-avx512-flags }} \ | |||
| -DCMAKE_C_COMPILER_LAUNCHER=ccache \ | |||
| -DCMAKE_Fortran_COMPILER_LAUNCHER=ccache \ | |||
| .. | |||
| @@ -280,9 +294,22 @@ jobs: | |||
| key: ${{ steps.ccache-prepare.outputs.key }} | |||
| - name: Run tests | |||
| id: run-ctest | |||
| timeout-minutes: 60 | |||
| run: cd build && ctest | |||
| - name: Re-run tests | |||
| if: always() && (steps.run-ctest.outcome == 'failure') | |||
| timeout-minutes: 60 | |||
| run: | | |||
| cd build | |||
| echo "::group::Re-run ctest" | |||
| ctest --rerun-failed --output-on-failure || true | |||
| echo "::endgroup::" | |||
| echo "::group::Log from these tests" | |||
| [ ! -f Testing/Temporary/LastTest.log ] || cat Testing/Temporary/LastTest.log | |||
| echo "::endgroup::" | |||
| cross_build: | |||
| runs-on: ubuntu-22.04 | |||
| @@ -0,0 +1,110 @@ | |||
| name: loongarch64 qemu test | |||
| on: [push, pull_request] | |||
| jobs: | |||
| TEST: | |||
| runs-on: ubuntu-latest | |||
| strategy: | |||
| fail-fast: false | |||
| matrix: | |||
| include: | |||
| - target: LOONGSONGENERIC | |||
| triple: loongarch64-unknown-linux-gnu | |||
| opts: NO_SHARED=1 TARGET=LOONGSONGENERIC | |||
| - target: LOONGSON3R5 | |||
| triple: loongarch64-unknown-linux-gnu | |||
| opts: NO_SHARED=1 TARGET=LOONGSON3R5 | |||
| - target: LOONGSON2K1000 | |||
| triple: loongarch64-unknown-linux-gnu | |||
| opts: NO_SHARED=1 TARGET=LOONGSON2K1000 | |||
| steps: | |||
| - name: Checkout repository | |||
| uses: actions/checkout@v3 | |||
| # Install host build tools and qemu-user-static; the savoury1/virtualisation PPA is added first. | |||
| # -y keeps add-apt-repository/apt-get from stopping at a confirmation prompt on the non-interactive runner. | |||
| - name: Install APT deps | |||
| run: | | |||
| sudo add-apt-repository -y ppa:savoury1/virtualisation | |||
| sudo apt-get update | |||
| sudo apt-get install -y autoconf automake autotools-dev ninja-build make ccache \ | |||
| qemu-user-static | |||
| - name: Download and install loongarch64-toolchain | |||
| run: | | |||
| wget https://github.com/loongson/build-tools/releases/download/2022.09.06/loongarch64-clfs-7.3-cross-tools-gcc-glibc.tar.xz | |||
| tar -xf loongarch64-clfs-7.3-cross-tools-gcc-glibc.tar.xz -C /opt | |||
| - name: Set env | |||
| run: | | |||
| echo "LD_LIBRARY_PATH=/opt/cross-tools/target/usr/lib64:/opt/cross-tools/loongarch64-unknown-linux-gnu/lib64:$LD_LIBRARY_PATH" >> $GITHUB_ENV | |||
| echo "PATH=$GITHUB_WORKSPACE:/opt/cross-tools/bin:$PATH" >> $GITHUB_ENV | |||
| - name: Compilation cache | |||
| uses: actions/cache@v3 | |||
| with: | |||
| path: ~/.ccache | |||
| key: ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }}-${{ github.sha }} | |||
| restore-keys: | | |||
| ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }} | |||
| ccache-${{ runner.os }}-${{ matrix.target }} | |||
| - name: Configure ccache | |||
| run: | | |||
| test -d ~/.ccache || mkdir -p ~/.ccache | |||
| echo "max_size = 300M" > ~/.ccache/ccache.conf | |||
| echo "compression = true" >> ~/.ccache/ccache.conf | |||
| ccache -s | |||
| - name: Disable utest dsdot:dsdot_n_1 | |||
| run: | | |||
| echo -n > utest/test_dsdot.c | |||
| echo "Due to the qemu versions 7.2 causing utest cases to fail," | |||
| echo "the utest dsdot:dsdot_n_1 have been temporarily disabled." | |||
| - name: Build OpenBLAS | |||
| run: make CC='ccache ${{ matrix.triple }}-gcc -static' FC='ccache ${{ matrix.triple }}-gfortran -static' ${{ matrix.opts }} HOSTCC='ccache gcc' -j$(nproc) | |||
| - name: Test | |||
| run: | | |||
| qemu-loongarch64-static ./utest/openblas_utest | |||
| OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xscblat1 | |||
| OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xdcblat1 | |||
| OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xccblat1 | |||
| OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xzcblat1 | |||
| OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xscblat2 < ./ctest/sin2 | |||
| OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xdcblat2 < ./ctest/din2 | |||
| OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xccblat2 < ./ctest/cin2 | |||
| OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xzcblat2 < ./ctest/zin2 | |||
| OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xscblat3 < ./ctest/sin3 | |||
| OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xdcblat3 < ./ctest/din3 | |||
| OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xccblat3 < ./ctest/cin3 | |||
| OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xzcblat3 < ./ctest/zin3 | |||
| OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/sblat1 | |||
| OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/dblat1 | |||
| OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/cblat1 | |||
| OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/zblat1 | |||
| OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/sblat1 | |||
| OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/dblat1 | |||
| OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/cblat1 | |||
| OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/zblat1 | |||
| rm -f ./test/?BLAT2.SUMM | |||
| OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/sblat2 < ./test/sblat2.dat | |||
| OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/dblat2 < ./test/dblat2.dat | |||
| OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/cblat2 < ./test/cblat2.dat | |||
| OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/zblat2 < ./test/zblat2.dat | |||
| rm -f ./test/?BLAT2.SUMM | |||
| OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/sblat2 < ./test/sblat2.dat | |||
| OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/dblat2 < ./test/dblat2.dat | |||
| OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/cblat2 < ./test/cblat2.dat | |||
| OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/zblat2 < ./test/zblat2.dat | |||
| rm -f ./test/?BLAT3.SUMM | |||
| OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/sblat3 < ./test/sblat3.dat | |||
| OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/dblat3 < ./test/dblat3.dat | |||
| OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/cblat3 < ./test/cblat3.dat | |||
| OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/zblat3 < ./test/zblat3.dat | |||
| rm -f ./test/?BLAT3.SUMM | |||
| OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/sblat3 < ./test/sblat3.dat | |||
| OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/dblat3 < ./test/dblat3.dat | |||
| OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/cblat3 < ./test/cblat3.dat | |||
| OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/zblat3 < ./test/zblat3.dat | |||
| @@ -72,6 +72,7 @@ test/SBLAT3.SUMM | |||
| test/ZBLAT2.SUMM | |||
| test/ZBLAT3.SUMM | |||
| test/SHBLAT3.SUMM | |||
| test/SBBLAT3.SUMM | |||
| test/cblat1 | |||
| test/cblat2 | |||
| test/cblat3 | |||
| @@ -82,6 +83,7 @@ test/sblat1 | |||
| test/sblat2 | |||
| test/sblat3 | |||
| test/test_shgemm | |||
| test/test_sbgemm | |||
| test/zblat1 | |||
| test/zblat2 | |||
| test/zblat3 | |||
| @@ -7,7 +7,7 @@ pipeline { | |||
| stages { | |||
| stage('Build') { | |||
| steps { | |||
| sh 'make' | |||
| sh 'make clean && make' | |||
| } | |||
| } | |||
| } | |||
| @@ -9,7 +9,7 @@ pipeline { | |||
| steps { | |||
| sh 'sudo apt update' | |||
| sh 'sudo apt install gfortran -y' | |||
| sh 'make' | |||
| sh 'make clean && make' | |||
| } | |||
| } | |||
| } | |||
| @@ -384,6 +384,11 @@ GCCMINORVERSIONGTEQ4 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d | |||
| GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 7) | |||
| endif | |||
| ifeq ($(C_COMPILER), CLANG) | |||
| CLANGVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9) | |||
| CLANGVERSIONGTEQ12 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 12) | |||
| endif | |||
| # | |||
| # OS dependent settings | |||
| # | |||
| @@ -668,6 +673,7 @@ DYNAMIC_CORE += NEOVERSEN1 | |||
| ifneq ($(NO_SVE), 1) | |||
| DYNAMIC_CORE += NEOVERSEV1 | |||
| DYNAMIC_CORE += NEOVERSEN2 | |||
| DYNAMIC_CORE += ARMV8SVE | |||
| endif | |||
| DYNAMIC_CORE += CORTEXA55 | |||
| DYNAMIC_CORE += FALKOR | |||
| @@ -1086,8 +1092,9 @@ endif | |||
| endif | |||
| endif | |||
| ifeq ($(F_COMPILER), GFORTRAN) | |||
| ifeq ($(F_COMPILER), $(filter $(F_COMPILER),GFORTRAN FLANGNEW)) | |||
| CCOMMON_OPT += -DF_INTERFACE_GFORT | |||
| ifeq ($(F_COMPILER), GFORTRAN) | |||
| FCOMMON_OPT += -Wall | |||
| # make single-threaded LAPACK calls thread-safe #1847 | |||
| FCOMMON_OPT += -frecursive | |||
| @@ -1101,6 +1108,7 @@ EXTRALIB += -lgfortran | |||
| endif | |||
| endif | |||
| endif | |||
| endif | |||
| ifdef NO_BINARY_MODE | |||
| ifeq ($(ARCH), $(filter $(ARCH),mips64)) | |||
| ifdef BINARY64 | |||
| @@ -1767,6 +1775,8 @@ export TARGET_CORE | |||
| export NO_AVX512 | |||
| export NO_AVX2 | |||
| export BUILD_BFLOAT16 | |||
| export NO_LSX | |||
| export NO_LASX | |||
| export SBGEMM_UNROLL_M | |||
| export SBGEMM_UNROLL_N | |||
| @@ -75,18 +75,31 @@ endif | |||
| ifeq ($(CORE), COOPERLAKE) | |||
| ifndef NO_AVX512 | |||
| ifeq ($(C_COMPILER), GCC) | |||
| # cooperlake support was added in 10.1 | |||
| ifeq ($(GCCVERSIONGTEQ10)$(GCCMINORVERSIONGTEQ1), 11) | |||
| CCOMMON_OPT += -march=cooperlake | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=cooperlake | |||
| endif | |||
| else # gcc not support, fallback to avx512 | |||
| CCOMMON_OPT += -march=skylake-avx512 | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=skylake-avx512 | |||
| endif | |||
| endif | |||
| # cooperlake support was added in 10.1 | |||
| ifeq ($(GCCVERSIONGTEQ10)$(GCCMINORVERSIONGTEQ1), 11) | |||
| CCOMMON_OPT += -march=cooperlake | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=cooperlake | |||
| endif | |||
| else # gcc not support, fallback to avx512 | |||
| CCOMMON_OPT += -march=skylake-avx512 | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=skylake-avx512 | |||
| endif | |||
| endif | |||
| else ifeq ($(C_COMPILER), CLANG) | |||
| # cooperlake support was added in clang 9 | |||
| ifeq ($(CLANGVERSIONGTEQ9), 1) | |||
| CCOMMON_OPT += -march=cooperlake | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=cooperlake | |||
| endif | |||
| else # not supported in clang, fallback to avx512 | |||
| CCOMMON_OPT += -march=skylake-avx512 | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=skylake-avx512 | |||
| endif | |||
| endif | |||
| endif | |||
| ifeq ($(OSNAME), CYGWIN_NT) | |||
| CCOMMON_OPT += -fno-asynchronous-unwind-tables | |||
| @@ -104,18 +117,31 @@ endif | |||
| ifeq ($(CORE), SAPPHIRERAPIDS) | |||
| ifndef NO_AVX512 | |||
| ifeq ($(C_COMPILER), GCC) | |||
| # sapphire rapids support was added in 11 | |||
| ifeq ($(GCCVERSIONGTEQ11), 1) | |||
| CCOMMON_OPT += -march=sapphirerapids | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=sapphirerapids | |||
| endif | |||
| else # gcc not support, fallback to avx512 | |||
| CCOMMON_OPT += -march=skylake-avx512 | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=skylake-avx512 | |||
| endif | |||
| endif | |||
| # sapphire rapids support was added in 11 | |||
| ifeq ($(GCCVERSIONGTEQ11), 1) | |||
| CCOMMON_OPT += -march=sapphirerapids | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=sapphirerapids | |||
| endif | |||
| else # gcc not support, fallback to avx512 | |||
| CCOMMON_OPT += -march=skylake-avx512 | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=skylake-avx512 | |||
| endif | |||
| endif | |||
| else ifeq ($(C_COMPILER), CLANG) | |||
| # sapphire rapids support was added in clang 12 | |||
| ifeq ($(CLANGVERSIONGTEQ12), 1) | |||
| CCOMMON_OPT += -march=sapphirerapids | |||
| # the NAG Fortran compiler is not given the -march option | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=sapphirerapids | |||
| endif | |||
| else # not supported in clang, fallback to avx512 | |||
| CCOMMON_OPT += -march=skylake-avx512 | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=skylake-avx512 | |||
| endif | |||
| endif | |||
| endif | |||
| ifeq ($(OSNAME), CYGWIN_NT) | |||
| CCOMMON_OPT += -fno-asynchronous-unwind-tables | |||
| @@ -271,6 +271,19 @@ jobs: | |||
| - script: | | |||
| make TARGET=ARMV7 DYNAMIC_ARCH=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 | |||
| # Cross-build for arm64 (DYNAMIC_ARCH) with the Xcode 12.5.1 clang and the MacOSX11.3 SDK. | |||
| # The first three script lines are diagnostics only: available SDKs, supported arm64 CPUs, | |||
| # and the version of the same clang that CC points to. | |||
| - job: OSX_xbuild_DYNAMIC_ARM64 | |||
| pool: | |||
| vmImage: 'macOS-11' | |||
| variables: | |||
| CC: /Applications/Xcode_12.5.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang | |||
| CFLAGS: -O2 -Wno-macro-redefined -isysroot /Applications/Xcode_12.5.1.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX11.3.sdk -arch arm64 | |||
| steps: | |||
| - script: | | |||
| ls /Applications/Xcode_12.5.1.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs | |||
| /Applications/Xcode_12.5.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang -arch arm64 --print-supported-cpus | |||
| /Applications/Xcode_12.5.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang --version | |||
| make TARGET=ARMV8 DYNAMIC_ARCH=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 | |||
| - job: ALPINE_MUSL | |||
| pool: | |||
| vmImage: 'ubuntu-latest' | |||
| @@ -185,6 +185,37 @@ if [ "$architecture" = "mips" ] || [ "$architecture" = "mips64" ]; then | |||
| rm -rf "$tmpd" | |||
| fi | |||
| # LoongArch SIMD probe: try to build a one-instruction program with -mlsx and | |||
| # with -mlasx; a failed build sets no_lsx / no_lasx to 1. | |||
| no_lsx=0 | |||
| no_lasx=0 | |||
| if [ "$architecture" = "loongarch64" ]; then | |||
| tmpd="$(mktemp -d)" | |||
| # --- LSX (vadd.b on $vr0) --- | |||
| tmplsx="$tmpd/lsx.c" | |||
| codelsx='"vadd.b $vr0, $vr0, $vr0"' | |||
| lsx_flags='-march=loongarch64 -mlsx' | |||
| { | |||
| printf "#include <lsxintrin.h>\n\n" | |||
| printf "void main(void){ __asm__ volatile(%s);}\n" "$codelsx" | |||
| } >> "$tmplsx" | |||
| args="$lsx_flags -o $tmplsx.o $tmplsx" | |||
| if ! $compiler_name $flags $args >/dev/null 2>&1; then | |||
| no_lsx=1 | |||
| fi | |||
| # --- LASX (xvadd.b on $xr0) --- | |||
| tmplasx="$tmpd/lasx.c" | |||
| codelasx='"xvadd.b $xr0, $xr0, $xr0"' | |||
| lasx_flags='-march=loongarch64 -mlasx' | |||
| { | |||
| printf "#include <lasxintrin.h>\n\n" | |||
| printf "void main(void){ __asm__ volatile(%s);}\n" "$codelasx" | |||
| } >> "$tmplasx" | |||
| args="$lasx_flags -o $tmplasx.o $tmplasx" | |||
| if ! $compiler_name $flags $args >/dev/null 2>&1; then | |||
| no_lasx=1 | |||
| fi | |||
| # sources and outputs all live under $tmpd | |||
| rm -rf "$tmpd" | |||
| fi | |||
| case "$data" in | |||
| *ARCH_X86_64*) architecture=x86_64 ;; | |||
| *ARCH_X86*) architecture=x86 ;; | |||
| @@ -252,6 +283,9 @@ if [ "$architecture" = "arm64" ]; then | |||
| no_sve=0 | |||
| { | |||
| $compiler_name $flags $args >/dev/null 2>&1 | |||
| } || { | |||
| args=" -Msve_intrinsics -c -o $tmpf.o $tmpf" | |||
| $compiler_name $flags $args >/dev/null 2>&1 | |||
| } || { | |||
| no_sve=1 | |||
| } | |||
| @@ -399,6 +433,8 @@ done | |||
| [ "$no_avx512" -eq 1 ] && printf "NO_AVX512=1\n" | |||
| [ "$no_avx2" -eq 1 ] && printf "NO_AVX2=1\n" | |||
| [ "$oldgcc" -eq 1 ] && printf "OLDGCC=1\n" | |||
| [ "$no_lsx" -eq 1 ] && printf "NO_LSX=1\n" | |||
| [ "$no_lasx" -eq 1 ] && printf "NO_LASX=1\n" | |||
| } >> "$makefile" | |||
| os=`echo "$os" | tr '[[:lower:]]' '[[:upper:]]'/ ` | |||
| @@ -414,6 +450,8 @@ compiler=`echo "$compiler" | tr '[[:lower:]]' '[[:upper:]]' ` | |||
| [ -n "$need_fu" ] && printf "#define FUNDERSCORE\t%s\n" "$need_fu" | |||
| [ "$no_msa" -eq 1 ] && printf "#define NO_MSA\t1\n" | |||
| [ "$c11_atomics" -eq 1 ] && printf "#define HAVE_C11\t1\n" | |||
| [ "$no_lsx" -eq 1 ] && printf "#define NO_LSX\t1\n" | |||
| [ "$no_lasx" -eq 1 ] && printf "#define NO_LASX\t1\n" | |||
| } >> "$config" | |||
| @@ -232,6 +232,47 @@ if (($architecture eq "mips") || ($architecture eq "mips64")) { | |||
| } | |||
| } | |||
| # LoongArch SIMD probe: try to build a one-instruction program with -mlsx and | |||
| # with -mlasx. $no_lsx / $no_lasx become 1 when the corresponding build fails. | |||
| # If File::Temp cannot be loaded the probe is skipped and both stay 0. | |||
| $no_lsx = 0; | |||
| $no_lasx = 0; | |||
| if (($architecture eq "loongarch64")) { | |||
| eval "use File::Temp qw(tempfile)"; | |||
| if ($@){ | |||
| warn "could not load PERL module File::Temp, so could not check LSX and LASX capatibility"; | |||
| } else { | |||
| # LSX: single vadd.b on $vr0 | |||
| $tmplsx = File::Temp->new( SUFFIX => '.c' , UNLINK => 1 ); | |||
| $codelsx = '"vadd.b $vr0, $vr0, $vr0"'; | |||
| $lsx_flags = "-march=loongarch64 -mlsx"; | |||
| print $tmplsx "#include <lsxintrin.h>\n\n"; | |||
| print $tmplsx "void main(void){ __asm__ volatile($codelsx); }\n"; | |||
| $args = "$lsx_flags -o $tmplsx.o $tmplsx"; | |||
| my @cmd = ("$compiler_name $flags $args >/dev/null 2>/dev/null"); | |||
| system(@cmd); | |||
| # $? holds the status of the compiler run | |||
| $no_lsx = ($? != 0) ? 1 : 0; | |||
| unlink("$tmplsx.o"); | |||
| # LASX: single xvadd.b on $xr0 | |||
| $tmplasx = File::Temp->new( SUFFIX => '.c' , UNLINK => 1 ); | |||
| $codelasx = '"xvadd.b $xr0, $xr0, $xr0"'; | |||
| $lasx_flags = "-march=loongarch64 -mlasx"; | |||
| print $tmplasx "#include <lasxintrin.h>\n\n"; | |||
| print $tmplasx "void main(void){ __asm__ volatile($codelasx); }\n"; | |||
| $args = "$lasx_flags -o $tmplasx.o $tmplasx"; | |||
| @cmd = ("$compiler_name $flags $args >/dev/null 2>/dev/null"); | |||
| system(@cmd); | |||
| $no_lasx = ($? != 0) ? 1 : 0; | |||
| unlink("$tmplasx.o"); | |||
| } | |||
| } | |||
| $architecture = x86 if ($data =~ /ARCH_X86/); | |||
| $architecture = x86_64 if ($data =~ /ARCH_X86_64/); | |||
| $architecture = e2k if ($data =~ /ARCH_E2K/); | |||
| @@ -424,6 +465,8 @@ print MAKEFILE "NO_RV64GV=1\n" if $no_rv64gv eq 1; | |||
| print MAKEFILE "NO_AVX512=1\n" if $no_avx512 eq 1; | |||
| print MAKEFILE "NO_AVX2=1\n" if $no_avx2 eq 1; | |||
| print MAKEFILE "OLDGCC=1\n" if $oldgcc eq 1; | |||
| print MAKEFILE "NO_LSX=1\n" if $no_lsx eq 1; | |||
| print MAKEFILE "NO_LASX=1\n" if $no_lasx eq 1; | |||
| $os =~ tr/[a-z]/[A-Z]/; | |||
| $architecture =~ tr/[a-z]/[A-Z]/; | |||
| @@ -437,6 +480,8 @@ print CONFFILE "#define __64BIT__\t1\n" if $binformat eq bin64; | |||
| print CONFFILE "#define FUNDERSCORE\t$need_fu\n" if $need_fu ne ""; | |||
| print CONFFILE "#define HAVE_MSA\t1\n" if $have_msa eq 1; | |||
| print CONFFILE "#define HAVE_C11\t1\n" if $c11_atomics eq 1; | |||
| print CONFFILE "#define NO_LSX\t1\n" if $no_lsx eq 1; | |||
| print CONFFILE "#define NO_LASX\t1\n" if $no_lasx eq 1; | |||
| if ($os eq "LINUX") { | |||
| @@ -46,7 +46,7 @@ if (DYNAMIC_ARCH) | |||
| if (ARM64) | |||
| set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA55 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110) | |||
| if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 9.99) | |||
| set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2) | |||
| set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2 ARMV8SVE) | |||
| endif () | |||
| if (DYNAMIC_LIST) | |||
| set(DYNAMIC_CORE ARMV8 ${DYNAMIC_LIST}) | |||
| @@ -135,7 +135,7 @@ if (ARM64) | |||
| set(BINARY_DEFINED 1) | |||
| endif () | |||
| if (${ARCH} STREQUAL "riscv64") | |||
| if (RISCV64) | |||
| set(NO_BINARY_MODE 1) | |||
| set(BINARY_DEFINED 1) | |||
| endif () | |||
| @@ -180,22 +180,30 @@ endif () | |||
| if (${CORE} STREQUAL NEOVERSEN2) | |||
| if (NOT DYNAMIC_ARCH) | |||
| execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) | |||
| if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4) | |||
| set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2") | |||
| if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE) | |||
| set (CCOMMON_OPT "${CCOMMON_OPT} -Msve_intrinsics -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2") | |||
| else () | |||
| set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve") | |||
| endif() | |||
| execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) | |||
| if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4) | |||
| set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2") | |||
| else () | |||
| set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve") | |||
| endif() | |||
| endif () | |||
| endif () | |||
| endif () | |||
| if (${CORE} STREQUAL NEOVERSEV1) | |||
| if (NOT DYNAMIC_ARCH) | |||
| execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) | |||
| if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4) | |||
| set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.4-a+sve -mtune=neoverse-v1") | |||
| if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE) | |||
| set (CCOMMON_OPT "${CCOMMON_OPT} -Msve_intrinsics -march=armv8.4-a+sve -mtune=neoverse-v1") | |||
| else () | |||
| set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve") | |||
| execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) | |||
| if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4) | |||
| set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.4-a+sve -mtune=neoverse-v1") | |||
| else () | |||
| set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve") | |||
| endif() | |||
| endif() | |||
| endif () | |||
| endif () | |||
| @@ -213,7 +221,11 @@ endif () | |||
| if (${CORE} STREQUAL ARMV8SVE) | |||
| if (NOT DYNAMIC_ARCH) | |||
| set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve") | |||
| if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE) | |||
| set (CCOMMON_OPT "${CCOMMON_OPT} -Msve_intrinsics -march=armv8-a+sve") | |||
| else () | |||
| set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve") | |||
| endif () | |||
| endif () | |||
| endif () | |||
| @@ -3,7 +3,8 @@ | |||
| ## Description: Ported from portion of OpenBLAS/Makefile.system | |||
| ## Sets Fortran related variables. | |||
| if (${F_COMPILER} STREQUAL "FLANG") | |||
| if (${F_COMPILER} STREQUAL "FLANG" AND NOT CMAKE_Fortran_COMPILER_ID STREQUAL "LLVMFlang") | |||
| # This is for classic Flang. LLVM Flang is handled with gfortran below. | |||
| set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_FLANG") | |||
| if (BINARY64 AND INTERFACE64) | |||
| set(FCOMMON_OPT "${FCOMMON_OPT} -i8") | |||
| @@ -38,15 +39,17 @@ if (${F_COMPILER} STREQUAL "G95") | |||
| endif () | |||
| endif () | |||
| if (${F_COMPILER} STREQUAL "GFORTRAN" OR ${F_COMPILER} STREQUAL "F95") | |||
| if (${F_COMPILER} STREQUAL "GFORTRAN" OR ${F_COMPILER} STREQUAL "F95" OR CMAKE_Fortran_COMPILER_ID STREQUAL "LLVMFlang") | |||
| set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_GFORT") | |||
| # ensure reentrancy of lapack codes | |||
| set(FCOMMON_OPT "${FCOMMON_OPT} -Wall -frecursive") | |||
| # work around ABI violation in passing string arguments from C | |||
| set(FCOMMON_OPT "${FCOMMON_OPT} -fno-optimize-sibling-calls") | |||
| #Don't include -lgfortran, when NO_LAPACK=1 or lsbcc | |||
| if (NOT NO_LAPACK) | |||
| set(EXTRALIB "${EXTRALIB} -lgfortran") | |||
| if (NOT CMAKE_Fortran_COMPILER_ID STREQUAL "LLVMFlang") | |||
| # ensure reentrancy of lapack codes | |||
| set(FCOMMON_OPT "${FCOMMON_OPT} -Wall -frecursive") | |||
| # work around ABI violation in passing string arguments from C | |||
| set(FCOMMON_OPT "${FCOMMON_OPT} -fno-optimize-sibling-calls") | |||
| if (NOT NO_LAPACK) | |||
| # Don't include -lgfortran, when NO_LAPACK=1 or lsbcc | |||
| set(EXTRALIB "${EXTRALIB} -lgfortran") | |||
| endif () | |||
| endif () | |||
| if (NO_BINARY_MODE) | |||
| if (MIPS64) | |||
| @@ -63,6 +66,13 @@ if (${F_COMPILER} STREQUAL "GFORTRAN" OR ${F_COMPILER} STREQUAL "F95") | |||
| set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp32") | |||
| endif () | |||
| endif () | |||
| # 64-bit RISC-V with the 64-bit integer interface: make default INTEGER 8 bytes wide. | |||
| if (RISCV64 AND BINARY64 AND INTERFACE64) | |||
| set(FCOMMON_OPT "${FCOMMON_OPT} -fdefault-integer-8") | |||
| endif () | |||
| else () | |||
| if (BINARY64) | |||
| set(FCOMMON_OPT "${FCOMMON_OPT} -m64") | |||
| @@ -282,23 +282,35 @@ if (DEFINED TARGET) | |||
| endif() | |||
| if (${TARGET} STREQUAL NEOVERSEV1) | |||
| if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE) | |||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -Msve_intrinsics -march=armv8.4-a+sve -mtune=neoverse-v1") | |||
| else () | |||
| execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) | |||
| if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4) | |||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv8.4-a+sve -mtune=neoverse-v1") | |||
| else () | |||
| message(FATAL_ERROR "Compiler ${CMAKE_C_COMPILER} ${GCC_VERSION} does not support Neoverse V1.") | |||
| endif() | |||
| endif() | |||
| endif() | |||
| # Kernel flags for TARGET=NEOVERSEN2. | |||
| # Compiler ID "PGI" (unless NO_SVE is set): pass -Msve_intrinsics with the N2 -march/-mtune flags. | |||
| # Otherwise: require a compiler whose -dumpversion reports at least 10.4, else stop with an error. | |||
| if (${TARGET} STREQUAL NEOVERSEN2) | |||
| if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE) | |||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -Msve_intrinsics -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2") | |||
| else () | |||
| execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) | |||
| if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4) | |||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2") | |||
| else () | |||
| message(FATAL_ERROR "Compiler ${CMAKE_C_COMPILER} ${GCC_VERSION} does not support Neoverse N2.") | |||
| endif() | |||
| endif() | |||
| endif() | |||
| # Kernel flags for TARGET=ARMV8SVE: armv8.2-a+sve, plus -Msve_intrinsics | |||
| # when the compiler ID is "PGI" and NO_SVE is not set. | |||
| if (${TARGET} STREQUAL ARMV8SVE) | |||
| if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE) | |||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -Msve_intrinsics -march=armv8.2-a+sve") | |||
| else () | |||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv8.2-a+sve") | |||
| endif() | |||
| endif() | |||
| endif() | |||
| @@ -44,6 +44,8 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "mips64.*") | |||
| set(MIPS64 1) | |||
| elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "loongarch64.*") | |||
| set(LOONGARCH64 1) | |||
| elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "riscv64.*") | |||
| set(RISCV64 1) | |||
| elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*") | |||
| if (NOT BINARY) | |||
| if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8") | |||
| @@ -60,7 +62,7 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*") | |||
| endif() | |||
| elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "i686.*|i386.*|x86.*|amd64.*|AMD64.*") | |||
| set(X86 1) | |||
| elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64.*|AARCH64.*|arm64.*|ARM64.*)") | |||
| elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64.*|AARCH64.*|arm64.*|ARM64.*|armv8.*)") | |||
| if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8") | |||
| set(ARM64 1) | |||
| else() | |||
| @@ -107,7 +109,7 @@ else() | |||
| endif () | |||
| if (NOT BINARY) | |||
| if (X86_64 OR ARM64 OR POWER OR MIPS64 OR LOONGARCH64) | |||
| if (X86_64 OR ARM64 OR POWER OR MIPS64 OR LOONGARCH64 OR RISCV64) | |||
| set(BINARY 64) | |||
| else () | |||
| set(BINARY 32) | |||
| @@ -53,7 +53,6 @@ extern void goto_set_num_threads(int nthreads); | |||
| /* Global Parameter */ | |||
| extern int blas_cpu_number; | |||
| extern int blas_num_threads; | |||
| extern int blas_num_threads_set; | |||
| extern int blas_omp_linked; | |||
| #define BLAS_LEGACY 0x8000U | |||
| @@ -136,15 +135,13 @@ typedef struct blas_queue { | |||
| #ifdef SMP_SERVER | |||
| extern int blas_server_avail; | |||
| extern int blas_omp_number_max; | |||
| static __inline int num_cpu_avail(int level) { | |||
| #ifdef USE_OPENMP | |||
| int openmp_nthreads; | |||
| if (blas_num_threads_set == 0) | |||
| openmp_nthreads=omp_get_max_threads(); | |||
| else | |||
| openmp_nthreads=blas_cpu_number; | |||
| #endif | |||
| #ifndef USE_OPENMP | |||
| @@ -156,7 +153,13 @@ int openmp_nthreads; | |||
| ) return 1; | |||
| #ifdef USE_OPENMP | |||
| if (blas_cpu_number != openmp_nthreads) { | |||
| if (openmp_nthreads > blas_omp_number_max){ | |||
| #ifdef DEBUG | |||
| fprintf(stderr,"WARNING - more OpenMP threads requested (%d) than available (%d)\n",openmp_nthreads,blas_omp_number_max); | |||
| #endif | |||
| openmp_nthreads = blas_omp_number_max; | |||
| } | |||
| if (blas_cpu_number != openmp_nthreads) { | |||
| goto_set_num_threads(openmp_nthreads); | |||
| } | |||
| #endif | |||
| @@ -32,6 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| **********************************************************************************/ | |||
| #include <stdint.h> | |||
| #include <sys/auxv.h> | |||
| /* If LASX extension instructions supported, | |||
| * using core LOONGSON3R5 | |||
| @@ -46,9 +47,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define CPU_LOONGSON3R5 1 | |||
| #define CPU_LOONGSON2K1000 2 | |||
| #define LOONGARCH_CFG2 0x02 | |||
| #define LOONGARCH_LASX 1<<7 | |||
| #define LOONGARCH_LSX 1<<6 | |||
| #define LA_HWCAP_LSX (1<<4) | |||
| #define LA_HWCAP_LASX (1<<5) | |||
| static char *cpuname[] = { | |||
| "LOONGSONGENERIC", | |||
| @@ -64,17 +64,11 @@ static char *cpuname_lower[] = { | |||
| int detect(void) { | |||
| #ifdef __linux | |||
| uint32_t reg = 0; | |||
| int flag = (int)getauxval(AT_HWCAP); | |||
| __asm__ volatile ( | |||
| "cpucfg %0, %1 \n\t" | |||
| : "+&r"(reg) | |||
| : "r"(LOONGARCH_CFG2) | |||
| ); | |||
| if (reg & LOONGARCH_LASX) | |||
| if (flag & LA_HWCAP_LASX) | |||
| return CPU_LOONGSON3R5; | |||
| else if (reg & LOONGARCH_LSX) | |||
| else if (flag & LA_HWCAP_LSX) | |||
| return CPU_LOONGSON2K1000; | |||
| else | |||
| return CPU_GENERIC; | |||
| @@ -1551,6 +1551,7 @@ int get_cpuname(void){ | |||
| case 7: // Raptor Lake | |||
| case 10: | |||
| case 15: | |||
| case 14: // Alder Lake N | |||
| if(support_avx2()) | |||
| return CPUTYPE_HASWELL; | |||
| if(support_avx()) | |||
| @@ -2360,6 +2361,7 @@ int get_coretype(void){ | |||
| case 7: // Raptor Lake | |||
| case 10: | |||
| case 15: | |||
| case 14: // Alder Lake N | |||
| #ifndef NO_AVX2 | |||
| if(support_avx2()) | |||
| return CORE_HASWELL; | |||
| @@ -208,7 +208,7 @@ FLDFLAGS = $(FFLAGS:-fPIC=) $(LDFLAGS) | |||
| ifeq ($(USE_OPENMP), 1) | |||
| ifeq ($(F_COMPILER), GFORTRAN) | |||
| ifeq ($(C_COMPILER), CLANG) | |||
| CEXTRALIB = -lomp | |||
| CEXTRALIB += -lomp | |||
| endif | |||
| endif | |||
| ifeq ($(F_COMPILER), NAG) | |||
| @@ -0,0 +1,270 @@ | |||
| # Guidance for redistributing OpenBLAS | |||
| *We note that this document contains recommendations only - packagers and other | |||
| redistributors are in charge of how OpenBLAS is built and distributed in their | |||
| systems, and may have good reasons to deviate from the guidance given on this | |||
| page. These recommendations are aimed at general packaging systems, with a user | |||
| base that typically is large, open source (or freely available at least), and | |||
| doesn't behave uniformly or have a direct connection to the packager.* | |||
| OpenBLAS has a large number of build-time options which can be used to change | |||
| how it behaves at runtime, how artifacts or symbols are named, etc. Variation | |||
| in build configuration can be necessary to achieve a given end goal within a | |||
| distribution or as an end user. However, such variation can also make it more | |||
| difficult to build on top of OpenBLAS and ship code or other packages in a way | |||
| that works across many different distros. Here we provide guidance about the | |||
| most important build options, what effects they may have when changed, and | |||
| which ones to default to. | |||
| The Make and CMake build systems provide equivalent options and yield more or | |||
| less the same artifacts, but not exactly (the CMake builds are still | |||
| experimental). You can choose either one and the options will function in the | |||
| same way, however the CMake outputs may require some renaming. To review | |||
| available build options, see `Makefile.rule` or `CMakeLists.txt` in the root of | |||
| the repository. | |||
| Build options typically fall into two categories: (a) options that affect the | |||
| user interface, such as library and symbol names or APIs that are made | |||
| available, and (b) options that affect performance and runtime behavior, such | |||
| as threading behavior or CPU architecture-specific code paths. The user | |||
| interface options are more important to keep aligned between distributions, | |||
| while for the performance-related options there are typically more reasons to | |||
| make choices that deviate from the defaults. | |||
| Here are recommendations for user interface related packaging choices where it | |||
| is not likely to be a good idea to deviate (typically these are the default | |||
| settings): | |||
| 1. Include CBLAS. The CBLAS interface is widely used and it doesn't affect | |||
| binary size much, so don't turn it off. | |||
| 2. Include LAPACK and LAPACKE. The LAPACK interface is also widely used, and | |||
| while it does make up a significant part of the binary size of the installed | |||
| library, that does not outweigh the regression in usability when deviating | |||
| from the default here.[^1] | |||
| 3. Always distribute the pkg-config (`.pc`) and CMake (`.cmake`) dependency | |||
| detection files. These files are used by build systems when users want to | |||
| link against OpenBLAS, and there is no benefit to leaving them out. | |||
| 4. Provide the LP64 interface by default, and if in addition to that you choose | |||
| to provide an ILP64 interface build as well, use a symbol suffix to avoid | |||
| symbol name clashes (see the next section). | |||
| [^1]: All major distributions do include LAPACK as of mid 2023 as far as we | |||
| know. Older versions of Arch Linux did not, and that was known to cause | |||
| problems. | |||
| ## ILP64 interface builds | |||
| The LP64 (32-bit integer) interface is the default build, and has | |||
| well-established C and Fortran APIs as determined by the reference (Netlib) | |||
| BLAS and LAPACK libraries. The ILP64 (64-bit integer) interface however does | |||
| not have a standard API: symbol names and shared/static library names can be | |||
| produced in multiple ways, and this tends to make it difficult to use. | |||
| As of today there is an agreed-upon way of choosing names for OpenBLAS between | |||
| a number of key users/redistributors, which is the closest thing to a standard | |||
| that there is now. However, there is an ongoing standardization effort in the | |||
| reference BLAS and LAPACK libraries, which differs from the current OpenBLAS | |||
| agreed-upon convention. In this section we'll aim to explain both. | |||
| Those two methods are fairly similar, and have a key thing in common: *using a | |||
| symbol suffix*. This is good practice; if you distribute an ILP64 build, it is | |||
| recommended that it use a symbol suffix containing `64` in the name. | |||
| This avoids potential symbol clashes when different packages which depend on | |||
| OpenBLAS load both an LP64 and an ILP64 library into memory at the same time. | |||
| ### The current OpenBLAS agreed-upon ILP64 convention | |||
| This convention comprises the shared library name and the symbol suffix in the | |||
| shared library. The symbol suffix to use is `64_`, implying that the library | |||
| name will be `libopenblas64_.so` and the symbols in that library end in `64_`. | |||
| The central issue where this was discussed is | |||
| [openblas#646](https://github.com/xianyi/OpenBLAS/issues/646), and adopters | |||
| include Fedora, Julia, NumPy and SciPy - SuiteSparse already used it as well. | |||
| To build shared and static libraries with the currently recommended ILP64 | |||
| conventions with Make: | |||
| ```bash | |||
| $ make INTERFACE64=1 SYMBOLSUFFIX=64_ | |||
| ``` | |||
| This will produce libraries named `libopenblas64_.so|a`, a pkg-config file | |||
| named `openblas64.pc`, and CMake and header files. | |||
| Installing locally and inspecting the output will show a few more details: | |||
| ```bash | |||
| $ make install PREFIX=$PWD/../openblas/make64 INTERFACE64=1 SYMBOLSUFFIX=64_ | |||
| $ tree . # output slightly edited down | |||
| . | |||
| ├── include | |||
| │ ├── cblas.h | |||
| │ ├── f77blas.h | |||
| │ ├── lapacke_config.h | |||
| │ ├── lapacke.h | |||
| │ ├── lapacke_mangling.h | |||
| │ ├── lapacke_utils.h | |||
| │ ├── lapack.h | |||
| │ └── openblas_config.h | |||
| └── lib | |||
| ├── cmake | |||
| │ └── openblas | |||
| │ ├── OpenBLASConfig.cmake | |||
| │ └── OpenBLASConfigVersion.cmake | |||
| ├── libopenblas64_.a | |||
| ├── libopenblas64_.so | |||
| └── pkgconfig | |||
| └── openblas64.pc | |||
| ``` | |||
| A key point is the symbol names. These will equal the LP64 symbol names, then | |||
| (for Fortran only) the compiler mangling, and then the `64_` symbol suffix. | |||
| Hence to obtain the final symbol names, we need to take into account which | |||
| Fortran compiler we are using. For the most common cases (e.g., gfortran, Intel | |||
| Fortran, or Flang), that means appending a single underscore. In that case, the | |||
| result is: | |||
| | base API name | binary symbol name | call from Fortran code | call from C code | | |||
| |---------------|--------------------|------------------------|-----------------------| | |||
| | `dgemm` | `dgemm_64_` | `dgemm_64(...)` | `dgemm_64_(...)` | | |||
| | `cblas_dgemm` | `cblas_dgemm64_` | n/a | `cblas_dgemm64_(...)` | | |||
| It is quite useful to have these symbol names be as uniform as possible across | |||
| different packaging systems. | |||
| The equivalent build options with CMake are: | |||
| ```bash | |||
| $ mkdir build && cd build | |||
| $ cmake .. -DINTERFACE64=1 -DSYMBOLSUFFIX=64_ -DBUILD_SHARED_LIBS=ON -DBUILD_STATIC_LIBS=ON | |||
| $ cmake --build . -j | |||
| ``` | |||
| Note that the result is not 100% identical to the Make result. For example, the | |||
| library name ends in `_64` rather than `64_` - it is recommended to rename the | |||
| libraries to match the Make library names (also update the `libsuffix` entry in | |||
| `openblas64.pc` to match that rename). | |||
| ```bash | |||
| $ cmake --install . --prefix $PWD/../../openblas/cmake64 | |||
| $ tree . | |||
| . | |||
| ├── include | |||
| │ └── openblas64 | |||
| │ ├── cblas.h | |||
| │ ├── f77blas.h | |||
| │ ├── lapacke_config.h | |||
| │ ├── lapacke_example_aux.h | |||
| │ ├── lapacke.h | |||
| │ ├── lapacke_mangling.h | |||
| │ ├── lapacke_utils.h | |||
| │ ├── lapack.h | |||
| │ ├── openblas64 | |||
| │ │ └── lapacke_mangling.h | |||
| │ └── openblas_config.h | |||
| └── lib | |||
| ├── cmake | |||
| │ └── OpenBLAS64 | |||
| │ ├── OpenBLAS64Config.cmake | |||
| │ ├── OpenBLAS64ConfigVersion.cmake | |||
| │ ├── OpenBLAS64Targets.cmake | |||
| │ └── OpenBLAS64Targets-noconfig.cmake | |||
| ├── libopenblas_64.a | |||
| ├── libopenblas_64.so -> libopenblas_64.so.0 | |||
| └── pkgconfig | |||
| └── openblas64.pc | |||
| ``` | |||
| ### The upcoming standardized ILP64 convention | |||
| While the `64_` convention above got some adoption, it's slightly hacky and is | |||
| implemented through the use of `objcopy`. An effort is ongoing for a more | |||
| broadly adopted convention in the reference BLAS and LAPACK libraries, using | |||
| (a) the `_64` suffix, and (b) applying that suffix _before_ rather than after | |||
| Fortran compiler mangling. The central issue for this is | |||
| [lapack#666](https://github.com/Reference-LAPACK/lapack/issues/666). | |||
| For the most common cases of compiler mangling (a single `_` appended), the end | |||
| result will be: | |||
| | base API name | binary symbol name | call from Fortran code | call from C code | | |||
| |---------------|--------------------|------------------------|-----------------------| | |||
| | `dgemm` | `dgemm_64_` | `dgemm_64(...)` | `dgemm_64_(...)` | | |||
| | `cblas_dgemm` | `cblas_dgemm_64` | n/a | `cblas_dgemm_64(...)` | | |||
| For other compiler mangling schemes, replace the trailing `_` by the scheme in use. | |||
| The shared library name for this `_64` convention should be `libopenblas_64.so`. | |||
| Note: it is not yet possible to produce an OpenBLAS build which employs this | |||
| convention! Once reference BLAS and LAPACK with support for `_64` have been | |||
| released, a future OpenBLAS release will support it. For now, please use the | |||
| older `64_` scheme and avoid using the name `libopenblas_64.so`; it should be | |||
| considered reserved for future use of the `_64` standard as prescribed by | |||
| reference BLAS/LAPACK. | |||
| ## Performance and runtime behavior related build options | |||
| For these options there are multiple reasonable or common choices. | |||
| ### Threading related options | |||
| OpenBLAS can be built as a multi-threaded or single-threaded library, with the | |||
| default being multi-threaded. It's expected that the default `libopenblas` | |||
| library is multi-threaded; if you'd like to also distribute single-threaded | |||
| builds, consider naming them `libopenblas_sequential`. | |||
| OpenBLAS can be built with pthreads or OpenMP as the threading model, with the | |||
| default being pthreads. Both options are commonly used, and the choice here | |||
| should not influence the shared library name. The choice will be captured by | |||
| the `.pc` file. For example: | |||
| ```bash | |||
| $ pkg-config --libs openblas | |||
| -fopenmp -lopenblas | |||
| $ cat openblas.pc | |||
| ... | |||
| openblas_config= ... USE_OPENMP=0 MAX_THREADS=24 | |||
| ``` | |||
| The maximum number of threads users will be able to use is determined at build | |||
| time by the `NUM_THREADS` build option. It defaults to 24, and there's a wide | |||
| range of values that are reasonable to use (up to 256). 64 is a typical choice | |||
| here; there is a memory footprint penalty that is linear in `NUM_THREADS`. | |||
| Please see `Makefile.rule` for more details. | |||
| ### CPU architecture related options | |||
| OpenBLAS contains a lot of CPU architecture-specific optimizations, hence when | |||
| distributing to a user base with a variety of hardware, it is recommended to | |||
| enable CPU architecture runtime detection. This will dynamically select | |||
| optimized kernels for individual APIs. To do this, use the `DYNAMIC_ARCH=1` | |||
| build option. This is usually done on all common CPU families, except when | |||
| there are known issues. | |||
| In case the CPU architecture is known (e.g. you're building binaries for macOS | |||
| M1 users), it is possible to specify the target architecture directly with the | |||
| `TARGET=` build option. | |||
| `DYNAMIC_ARCH` and `TARGET` are covered in more detail in the main `README.md` | |||
| in this repository. | |||
| ## Real-world examples | |||
| OpenBLAS is likely to be distributed according to one of these models: | |||
| 1. As a standalone package, or multiple packages, in a packaging ecosystem like | |||
| a Linux distro, Homebrew, conda-forge or MSYS2. | |||
| 2. Vendored as part of a larger package, e.g. in Julia, NumPy, SciPy, or R. | |||
| 3. Locally, e.g. made available as a build on a single HPC cluster. | |||
| The guidance on this page is most important for models (1) and (2). These links | |||
| to build recipes for a representative selection of packaging systems may be | |||
| helpful as a reference: | |||
| - [Fedora](https://src.fedoraproject.org/rpms/openblas/blob/rawhide/f/openblas.spec) | |||
| - [Debian](https://salsa.debian.org/science-team/openblas/-/blob/master/debian/rules) | |||
| - [Homebrew](https://github.com/Homebrew/homebrew-core/blob/HEAD/Formula/openblas.rb) | |||
| - [MSYS2](https://github.com/msys2/MINGW-packages/blob/master/mingw-w64-openblas/PKGBUILD) | |||
| - [conda-forge](https://github.com/conda-forge/openblas-feedstock/blob/main/recipe/build.sh) | |||
| - [NumPy/SciPy](https://github.com/MacPython/openblas-libs/blob/main/tools/build_openblas.sh) | |||
| - [Nixpkgs](https://github.com/NixOS/nixpkgs/blob/master/pkgs/development/libraries/science/math/openblas/default.nix) | |||
| @@ -973,7 +973,7 @@ void goto_set_num_threads(int num_threads) { | |||
| increased_threads = 1; | |||
| for(i = blas_num_threads - 1; i < num_threads - 1; i++){ | |||
| for(i = (blas_num_threads > 0) ? blas_num_threads - 1 : 0; i < num_threads - 1; i++){ | |||
| atomic_store_queue(&thread_status[i].queue, (blas_queue_t *)0); | |||
| thread_status[i].status = THREAD_STATUS_WAKEUP; | |||
| @@ -68,6 +68,7 @@ | |||
| #endif | |||
| int blas_server_avail = 0; | |||
| int blas_omp_number_max = 0; | |||
| extern int openblas_omp_adaptive_env(); | |||
| @@ -100,8 +101,6 @@ static void adjust_thread_buffers() { | |||
| void goto_set_num_threads(int num_threads) { | |||
| blas_num_threads_set = 1; | |||
| if (num_threads < 0) blas_num_threads_set = 0; | |||
| if (num_threads < 1) num_threads = blas_num_threads; | |||
| if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER; | |||
| @@ -125,6 +124,8 @@ void openblas_set_num_threads(int num_threads) { | |||
| } | |||
| int blas_thread_init(void){ | |||
| if(blas_omp_number_max <= 0) | |||
| blas_omp_number_max = omp_get_max_threads(); | |||
| blas_get_cpu_number(); | |||
| @@ -568,7 +568,7 @@ void goto_set_num_threads(int num_threads) | |||
| blas_server_avail = 1; | |||
| } | |||
| for(i = blas_num_threads - 1; i < num_threads - 1; i++){ | |||
| for(i = (blas_num_threads > 0) ? blas_num_threads - 1 : 0; i < num_threads - 1; i++){ | |||
| blas_threads[i] = CreateThread(NULL, 0, | |||
| blas_thread_server, (void *)i, | |||
| @@ -1,5 +1,6 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* Copyright 2023 The OpenBLAS Project */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| @@ -109,6 +110,11 @@ extern gotoblas_t gotoblas_NEOVERSEN2; | |||
| #else | |||
| #define gotoblas_NEOVERSEN2 gotoblas_ARMV8 | |||
| #endif | |||
| #ifdef DYN_ARMV8SVE | |||
| extern gotoblas_t gotoblas_ARMV8SVE; | |||
| #else | |||
| #define gotoblas_ARMV8SVE gotoblas_ARMV8 | |||
| #endif | |||
| #ifdef DYN_CORTEX_A55 | |||
| extern gotoblas_t gotoblas_CORTEXA55; | |||
| #else | |||
| @@ -128,17 +134,21 @@ extern gotoblas_t gotoblas_NEOVERSEN1; | |||
| #ifndef NO_SVE | |||
| extern gotoblas_t gotoblas_NEOVERSEV1; | |||
| extern gotoblas_t gotoblas_NEOVERSEN2; | |||
| extern gotoblas_t gotoblas_ARMV8SVE; | |||
| #else | |||
| #define gotoblas_NEOVERSEV1 gotoblas_ARMV8 | |||
| #define gotoblas_NEOVERSEN2 gotoblas_ARMV8 | |||
| #define gotoblas_ARMV8SVE gotoblas_ARMV8 | |||
| #endif | |||
| extern gotoblas_t gotoblas_THUNDERX3T110; | |||
| extern gotoblas_t gotoblas_CORTEXA55; | |||
| #endif | |||
| extern void openblas_warning(int verbose, const char * msg); | |||
| #define FALLBACK_VERBOSE 1 | |||
| #define NEOVERSEN1_FALLBACK "OpenBLAS : Your OS does not support SVE instructions. OpenBLAS is using Neoverse N1 kernels as a fallback, which may give poorer performance.\n" | |||
| #define NUM_CORETYPES 13 | |||
| #define NUM_CORETYPES 16 | |||
| /* | |||
| * In case asm/hwcap.h is outdated on the build system, make sure | |||
| @@ -147,6 +157,9 @@ extern void openblas_warning(int verbose, const char * msg); | |||
| #ifndef HWCAP_CPUID | |||
| #define HWCAP_CPUID (1 << 11) | |||
| #endif | |||
| #ifndef HWCAP_SVE | |||
| #define HWCAP_SVE (1 << 22) | |||
| #endif | |||
| #define get_cpu_ftr(id, var) ({ \ | |||
| __asm__ __volatile__ ("mrs %0, "#id : "=r" (var)); \ | |||
| @@ -168,6 +181,7 @@ static char *corename[] = { | |||
| "neoversen2", | |||
| "thunderx3t110", | |||
| "cortexa55", | |||
| "armv8sve", | |||
| "unknown" | |||
| }; | |||
| @@ -187,6 +201,7 @@ char *gotoblas_corename(void) { | |||
| if (gotoblas == &gotoblas_NEOVERSEN2) return corename[12]; | |||
| if (gotoblas == &gotoblas_THUNDERX3T110) return corename[13]; | |||
| if (gotoblas == &gotoblas_CORTEXA55) return corename[14]; | |||
| if (gotoblas == &gotoblas_ARMV8SVE) return corename[15]; | |||
| return corename[NUM_CORETYPES]; | |||
| } | |||
| @@ -221,6 +236,7 @@ static gotoblas_t *force_coretype(char *coretype) { | |||
| case 12: return (&gotoblas_NEOVERSEN2); | |||
| case 13: return (&gotoblas_THUNDERX3T110); | |||
| case 14: return (&gotoblas_CORTEXA55); | |||
| case 15: return (&gotoblas_ARMV8SVE); | |||
| } | |||
| snprintf(message, 128, "Core not found: %s\n", coretype); | |||
| openblas_warning(1, message); | |||
| @@ -281,9 +297,17 @@ static gotoblas_t *get_coretype(void) { | |||
| return &gotoblas_NEOVERSEN1; | |||
| #ifndef NO_SVE | |||
| case 0xd49: | |||
| return &gotoblas_NEOVERSEN2; | |||
| if (!(getauxval(AT_HWCAP) & HWCAP_SVE)) { | |||
| openblas_warning(FALLBACK_VERBOSE, NEOVERSEN1_FALLBACK); | |||
| return &gotoblas_NEOVERSEN1; | |||
| } else | |||
| return &gotoblas_NEOVERSEN2; | |||
| case 0xd40: | |||
| return &gotoblas_NEOVERSEV1; | |||
| if (!(getauxval(AT_HWCAP) & HWCAP_SVE)) { | |||
| openblas_warning(FALLBACK_VERBOSE, NEOVERSEN1_FALLBACK); | |||
| return &gotoblas_NEOVERSEN1; | |||
| }else | |||
| return &gotoblas_NEOVERSEV1; | |||
| #endif | |||
| case 0xd05: // Cortex A55 | |||
| return &gotoblas_CORTEXA55; | |||
| @@ -332,6 +356,12 @@ static gotoblas_t *get_coretype(void) { | |||
| snprintf(coremsg, 128, "Unknown CPU model - implementer %x part %x\n",implementer,part); | |||
| openblas_warning(1, coremsg); | |||
| } | |||
| #ifndef NO_SVE | |||
| if ((getauxval(AT_HWCAP) & HWCAP_SVE)) { | |||
| return &gotoblas_ARMV8SVE; | |||
| } | |||
| #endif | |||
| return NULL; | |||
| #endif | |||
| } | |||
| @@ -422,8 +422,6 @@ This value is equal or large than blas_cpu_number. This means some threads are s | |||
| */ | |||
| int blas_num_threads = 0; | |||
| int blas_num_threads_set = 0; | |||
| int goto_get_num_procs (void) { | |||
| return blas_cpu_number; | |||
| } | |||
| @@ -1996,8 +1994,6 @@ This value is equal or large than blas_cpu_number. This means some threads are s | |||
| */ | |||
| int blas_num_threads = 0; | |||
| int blas_num_threads_set = 0; | |||
| int goto_get_num_procs (void) { | |||
| return blas_cpu_number; | |||
| } | |||
| @@ -283,7 +283,6 @@ The numbers of threads in the thread pool. | |||
| This value is equal to or larger than blas_cpu_number. This means some threads are asleep. | |||
| */ | |||
| int blas_num_threads = 0; | |||
| int blas_num_threads_set = 0; | |||
| int goto_get_num_procs (void) { | |||
| return blas_cpu_number; | |||
| @@ -101,7 +101,14 @@ else | |||
| *flang*) | |||
| vendor=FLANG | |||
| openmp='-fopenmp' | |||
| ;; | |||
| data=`$compiler -v 2>&1 > /dev/null ` | |||
| v="${data#*version *}" | |||
| v="${v%%*.}" | |||
| major="${v%%.*}" | |||
| if [ "$major" -ge 17 ]; then | |||
| vendor=FLANGNEW | |||
| fi | |||
| ;; | |||
| *ifort*|*ifx*) | |||
| vendor=INTEL | |||
| openmp='-fopenmp' | |||
| @@ -68,7 +68,7 @@ void NAME(blasint *M, blasint *N, FLOAT *ALPHA, FLOAT *a, blasint *LDA, | |||
| info = 0; | |||
| if (lda < MAX(1, m)) info = 6; | |||
| if (lda < MAX(1, m)) info = 5; | |||
| if (ldc < MAX(1, m)) info = 8; | |||
| if (n < 0) info = 2; | |||
| @@ -54,6 +54,21 @@ FLOATRET NAME(blasint *N, FLOAT *x, blasint *INCX){ | |||
| if (n <= 0) return 0.; | |||
| #ifndef COMPLEX | |||
| if (n == 1) | |||
| #ifdef DOUBLE | |||
| return fabs(x[0]); | |||
| #else | |||
| return fabsf(x[0]); | |||
| #endif | |||
| #endif | |||
| if (incx < 0) | |||
| #ifdef COMPLEX | |||
| x -= (n - 1) * incx * 2; | |||
| #else | |||
| x -= (n - 1) * incx; | |||
| #endif | |||
| IDEBUG_START; | |||
| FUNCTION_PROFILE_START(); | |||
| @@ -82,6 +97,22 @@ FLOAT CNAME(blasint n, FLOAT *x, blasint incx){ | |||
| if (n <= 0) return 0.; | |||
| #ifndef COMPLEX | |||
| if (n == 1) | |||
| #ifdef DOUBLE | |||
| return fabs(x[0]); | |||
| #else | |||
| return fabsf(x[0]); | |||
| #endif | |||
| #endif | |||
| if (incx < 0) | |||
| #ifdef COMPLEX | |||
| x -= (n - 1) * incx * 2; | |||
| #else | |||
| x -= (n - 1) * incx; | |||
| #endif | |||
| IDEBUG_START; | |||
| FUNCTION_PROFILE_START(); | |||
| @@ -33,7 +33,7 @@ endif | |||
| ifdef TARGET_CORE | |||
| ifeq ($(TARGET_CORE), SAPPHIRERAPIDS) | |||
| override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) | |||
| ifeq ($(GCCVERSIONGTEQ11), 1) | |||
| ifeq (1, $(filter 1,$(GCCVERSIONGTEQ11) $(CLANGVERSIONGTEQ12))) | |||
| override CFLAGS += -march=sapphirerapids | |||
| else | |||
| override CFLAGS += -march=skylake-avx512 -mavx512f | |||
| @@ -48,7 +48,7 @@ ifeq ($(TARGET_CORE), SAPPHIRERAPIDS) | |||
| endif | |||
| else ifeq ($(TARGET_CORE), COOPERLAKE) | |||
| override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) | |||
| ifeq ($(GCCVERSIONGTEQ10), 1) | |||
| ifeq (1, $(filter 1,$(GCCVERSIONGTEQ10) $(CLANGVERSIONGTEQ9))) | |||
| override CFLAGS += -march=cooperlake | |||
| else | |||
| override CFLAGS += -march=skylake-avx512 -mavx512f | |||
| @@ -77,6 +77,12 @@ else ifeq ($(TARGET_CORE), ZEN) | |||
| override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) $(AVX2OPT) | |||
| else ifeq ($(TARGET_CORE), LOONGSON3R4) | |||
| override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) $(MSA_FLAGS) | |||
| else ifneq ($(filter NEOVERSEN2 NEOVERSEV1, $(TARGET_CORE)),) | |||
| ifeq ($(C_COMPILER), PGI) | |||
| override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -Msve_intrinsics | |||
| else | |||
| override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) | |||
| endif | |||
| else | |||
| override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) | |||
| endif | |||
| @@ -35,6 +35,12 @@ USE_TRMM = 1 | |||
| endif | |||
| endif | |||
| ifneq ($(DYNAMIC_ARCH), 1) | |||
| ifeq ($(TARGET), MIPS64_GENERIC) | |||
| USE_TRMM = 1 | |||
| endif | |||
| endif | |||
| ifeq ($(CORE), HASWELL) | |||
| USE_TRMM = 1 | |||
| endif | |||
| @@ -57,7 +57,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| FLOAT absxi = 0.0; | |||
| if (n <= 0 || inc_x <= 0) return(0.0); | |||
| if (n <= 0 || inc_x == 0) return(0.0); | |||
| if ( n == 1 ) return( ABS(x[0]) ); | |||
| n *= inc_x; | |||
| @@ -57,7 +57,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| BLASLONG inc_x2; | |||
| FLOAT temp; | |||
| if (n <= 0 || inc_x <= 0) return(0.0); | |||
| if (n <= 0 || inc_x == 0) return(0.0); | |||
| inc_x2 = 2 * inc_x; | |||
| @@ -57,7 +57,7 @@ CAMAXKERNEL = zamax.S | |||
| ZAMAXKERNEL = zamax.S | |||
| SAXPYKERNEL = axpy.S | |||
| DAXPYKERNEL = axpy.S | |||
| DAXPYKERNEL = daxpy_thunderx2t99.S | |||
| CAXPYKERNEL = zaxpy.S | |||
| ZAXPYKERNEL = zaxpy.S | |||
| @@ -81,45 +81,35 @@ DGEMVTKERNEL = gemv_t.S | |||
| CGEMVTKERNEL = zgemv_t.S | |||
| ZGEMVTKERNEL = zgemv_t.S | |||
| SASUMKERNEL = asum.S | |||
| DASUMKERNEL = asum.S | |||
| CASUMKERNEL = casum.S | |||
| ZASUMKERNEL = zasum.S | |||
| SCOPYKERNEL = copy.S | |||
| DCOPYKERNEL = copy.S | |||
| CCOPYKERNEL = copy.S | |||
| ZCOPYKERNEL = copy.S | |||
| SSWAPKERNEL = swap.S | |||
| DSWAPKERNEL = swap.S | |||
| CSWAPKERNEL = swap.S | |||
| ZSWAPKERNEL = swap.S | |||
| ISAMAXKERNEL = iamax.S | |||
| IDAMAXKERNEL = iamax.S | |||
| ICAMAXKERNEL = izamax.S | |||
| IZAMAXKERNEL = izamax.S | |||
| SNRM2KERNEL = nrm2.S | |||
| DNRM2KERNEL = nrm2.S | |||
| CNRM2KERNEL = znrm2.S | |||
| ZNRM2KERNEL = znrm2.S | |||
| DDOTKERNEL = dot.S | |||
| ifneq ($(C_COMPILER), PGI) | |||
| SDOTKERNEL = ../generic/dot.c | |||
| else | |||
| SDOTKERNEL = dot.S | |||
| endif | |||
| ifneq ($(C_COMPILER), PGI) | |||
| CDOTKERNEL = zdot.S | |||
| ZDOTKERNEL = zdot.S | |||
| else | |||
| CDOTKERNEL = ../arm/zdot.c | |||
| ZDOTKERNEL = ../arm/zdot.c | |||
| endif | |||
| SASUMKERNEL = sasum_thunderx2t99.c | |||
| DASUMKERNEL = dasum_thunderx2t99.c | |||
| CASUMKERNEL = casum_thunderx2t99.c | |||
| ZASUMKERNEL = zasum_thunderx2t99.c | |||
| SCOPYKERNEL = copy_thunderx2t99.c | |||
| DCOPYKERNEL = copy_thunderx2t99.c | |||
| CCOPYKERNEL = copy_thunderx2t99.c | |||
| ZCOPYKERNEL = copy_thunderx2t99.c | |||
| SSWAPKERNEL = swap_thunderx2t99.S | |||
| DSWAPKERNEL = swap_thunderx2t99.S | |||
| CSWAPKERNEL = swap_thunderx2t99.S | |||
| ZSWAPKERNEL = swap_thunderx2t99.S | |||
| ISAMAXKERNEL = iamax_thunderx2t99.c | |||
| IDAMAXKERNEL = iamax_thunderx2t99.c | |||
| ICAMAXKERNEL = izamax_thunderx2t99.c | |||
| IZAMAXKERNEL = izamax_thunderx2t99.c | |||
| SNRM2KERNEL = scnrm2_thunderx2t99.c | |||
| DNRM2KERNEL = dznrm2_thunderx2t99.c | |||
| CNRM2KERNEL = scnrm2_thunderx2t99.c | |||
| ZNRM2KERNEL = dznrm2_thunderx2t99.c | |||
| DDOTKERNEL = dot.c | |||
| SDOTKERNEL = dot.c | |||
| CDOTKERNEL = zdot_thunderx2t99.c | |||
| ZDOTKERNEL = zdot_thunderx2t99.c | |||
| DSDOTKERNEL = dot.S | |||
| DGEMM_BETA = dgemm_beta.S | |||
| @@ -170,8 +160,8 @@ DSYMMLCOPY_M = symm_lcopy_sve.c | |||
| CGEMMKERNEL = cgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S | |||
| CTRMMKERNEL = ctrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S | |||
| CGEMMINCOPY = cgemm_ncopy_sve_v1.c | |||
| CGEMMITCOPY = cgemm_tcopy_sve_v1.c | |||
| CGEMMINCOPY = gemm_ncopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c | |||
| CGEMMITCOPY = gemm_tcopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c | |||
| CGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c | |||
| CGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c | |||
| @@ -194,8 +184,8 @@ CSYMMLCOPY_M = zsymm_lcopy_sve.c | |||
| ZGEMMKERNEL = zgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S | |||
| ZTRMMKERNEL = ztrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S | |||
| ZGEMMINCOPY = zgemm_ncopy_sve_v1.c | |||
| ZGEMMITCOPY = zgemm_tcopy_sve_v1.c | |||
| ZGEMMINCOPY = gemm_ncopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c | |||
| ZGEMMITCOPY = gemm_tcopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c | |||
| ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c | |||
| ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c | |||
| @@ -1,98 +1 @@ | |||
| include $(KERNELDIR)/KERNEL.ARMV8SVE | |||
| DAXPYKERNEL = daxpy_thunderx2t99.S | |||
| SASUMKERNEL = sasum_thunderx2t99.c | |||
| DASUMKERNEL = dasum_thunderx2t99.c | |||
| CASUMKERNEL = casum_thunderx2t99.c | |||
| ZASUMKERNEL = zasum_thunderx2t99.c | |||
| SCOPYKERNEL = copy_thunderx2t99.c | |||
| DCOPYKERNEL = copy_thunderx2t99.c | |||
| CCOPYKERNEL = copy_thunderx2t99.c | |||
| ZCOPYKERNEL = copy_thunderx2t99.c | |||
| SSWAPKERNEL = swap_thunderx2t99.S | |||
| DSWAPKERNEL = swap_thunderx2t99.S | |||
| CSWAPKERNEL = swap_thunderx2t99.S | |||
| ZSWAPKERNEL = swap_thunderx2t99.S | |||
| ISAMAXKERNEL = iamax_thunderx2t99.c | |||
| IDAMAXKERNEL = iamax_thunderx2t99.c | |||
| ICAMAXKERNEL = izamax_thunderx2t99.c | |||
| IZAMAXKERNEL = izamax_thunderx2t99.c | |||
| SNRM2KERNEL = scnrm2_thunderx2t99.c | |||
| DNRM2KERNEL = dznrm2_thunderx2t99.c | |||
| CNRM2KERNEL = scnrm2_thunderx2t99.c | |||
| ZNRM2KERNEL = dznrm2_thunderx2t99.c | |||
| DDOTKERNEL = dot.c | |||
| SDOTKERNEL = dot.c | |||
| CDOTKERNEL = zdot_thunderx2t99.c | |||
| ZDOTKERNEL = zdot_thunderx2t99.c | |||
| CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| CTRMMUNCOPY_M = | |||
| CTRMMLNCOPY_M = | |||
| CTRMMUTCOPY_M = | |||
| CTRMMLTCOPY_M = | |||
| CHEMMLTCOPY_M = | |||
| CHEMMUTCOPY_M = | |||
| CSYMMUCOPY_M = | |||
| CSYMMLCOPY_M = | |||
| CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S | |||
| CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S | |||
| ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) | |||
| CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c | |||
| CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c | |||
| CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| else | |||
| CGEMMINCOPYOBJ = | |||
| CGEMMITCOPYOBJ = | |||
| endif | |||
| CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c | |||
| CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c | |||
| CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| ZTRSMCOPYLN_M = | |||
| ZTRSMCOPYLT_M = | |||
| ZTRSMCOPYUN_M = | |||
| ZTRSMCOPYUT_M = | |||
| ZTRMMUNCOPY_M = | |||
| ZTRMMLNCOPY_M = | |||
| ZTRMMUTCOPY_M = | |||
| ZTRMMLTCOPY_M = | |||
| ZHEMMLTCOPY_M = | |||
| ZHEMMUTCOPY_M = | |||
| ZSYMMUCOPY_M = | |||
| ZSYMMLCOPY_M = | |||
| ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S | |||
| ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S | |||
| ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) | |||
| ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c | |||
| ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c | |||
| ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| else | |||
| ZGEMMINCOPYOBJ = | |||
| ZGEMMITCOPYOBJ = | |||
| endif | |||
| ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c | |||
| ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c | |||
| ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| @@ -240,7 +240,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| add pB, pB, 32 | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] | |||
| .endm | |||
| .macro KERNELv1x4_M1 | |||
| @@ -276,9 +275,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1rw z15.s, p0/z, [pB, 28] | |||
| add pB, pB, 32 | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] | |||
| .endm | |||
| .macro KERNELv1x4_M2 | |||
| @@ -313,11 +309,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| OP_ri z23.s, p1/m, z2.s, z15.s | |||
| ld1rw z15.s, p0/z, [pB, 28] | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| add pB, pB, 32 | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] | |||
| .endm | |||
| .macro KERNELv1x4_E | |||
| @@ -341,10 +333,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| OP_ii z22.s, p1/m, z3.s, z15.s | |||
| OP_ri z23.s, p1/m, z2.s, z15.s | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] | |||
| .endm | |||
| .macro KERNELv1x4_SUB | |||
| @@ -383,13 +371,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| OP_ii z22.s, p1/m, z1.s, z15.s | |||
| OP_ri z23.s, p1/m, z0.s, z15.s | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| .endm | |||
| .macro SAVEv1x4 | |||
| prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||
| ld2w {z24.s, z25.s}, p1/z, [pCRow0] | |||
| fmla z24.s, p1/m, z16.s, alphaz_R | |||
| fmls z24.s, p1/m, z17.s, alphaz_I | |||
| @@ -407,8 +391,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| st2w {z26.s, z27.s}, p1, [pCRow1] | |||
| add pCRow1, pCRow1, lanes, lsl #3 | |||
| prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||
| ld2w {z28.s, z29.s}, p1/z, [pCRow2] | |||
| fmla z28.s, p1/m, z20.s, alphaz_R | |||
| fmls z28.s, p1/m, z21.s, alphaz_I | |||
| @@ -425,12 +407,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| fmla z31.s, p1/m, z23.s, alphaz_R | |||
| st2w {z30.s, z31.s}, p1, [pCRow3] | |||
| prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] | |||
| add pCRow3, pCRow3, lanes, lsl #3 // pC = pC + lanes * 2 *4 | |||
| prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] | |||
| .endm | |||
| /******************************************************************************/ | |||
| @@ -466,8 +444,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVEv1x2 | |||
| prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||
| ld2w {z24.s, z25.s}, p1/z, [pCRow0] | |||
| fmla z24.s, p1/m, z16.s, alphaz_R | |||
| fmls z24.s, p1/m, z17.s, alphaz_I | |||
| @@ -485,10 +461,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| st2w {z26.s, z27.s}, p1, [pCRow1] | |||
| add pCRow1, pCRow1, lanes, lsl #3 | |||
| prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||
| prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] | |||
| .endm | |||
| /******************************************************************************/ | |||
| @@ -516,8 +488,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVEv1x1 | |||
| prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||
| ld2w {z24.s, z25.s}, p1/z, [pCRow0] | |||
| fmla z24.s, p1/m, z16.s, alphaz_R | |||
| fmls z24.s, p1/m, z17.s, alphaz_I | |||
| @@ -527,8 +497,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 2 *4 | |||
| prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] | |||
| .endm | |||
| /******************************************************************************/ | |||
| @@ -553,9 +521,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| stp x26, x27, [sp, #(9 * 16)] | |||
| str x28, [sp, #(10 * 16)] | |||
| prfm PLDL1KEEP, [origPB] | |||
| prfm PLDL1KEEP, [origPA] | |||
| fmov alphaR, s0 | |||
| dup alphaz_R, alphaR | |||
| fmov alphaI, s1 | |||
| @@ -676,10 +641,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| bne .Lcgemm_kernel_L4_Mv1_46 | |||
| .Lcgemm_kernel_L4_Mv1_100: | |||
| prfm PLDL1KEEP, [pA] | |||
| prfm PLDL1KEEP, [pA, #64] | |||
| prfm PLDL1KEEP, [origPB] | |||
| SAVEv1x4 | |||
| .Lcgemm_kernel_L4_Mv1_END: | |||
| @@ -1,5 +1,6 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* Copyright 2023 The OpenBLAS Project */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| @@ -52,7 +53,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ | |||
| boffset = b; | |||
| j = 0; | |||
| svbool_t pg = svwhilelt_b32(j, n); | |||
| svbool_t pg = svwhilelt_b32((uint64_t)j, (uint64_t)n); | |||
| uint32_t active = svcntp_b32(svptrue_b32(), pg); | |||
| do { | |||
| @@ -69,7 +70,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ | |||
| aoffset += active * lda * 2; | |||
| j += svcntw(); | |||
| pg = svwhilelt_b32(j, n); | |||
| pg = svwhilelt_b32((uint64_t)j, (uint64_t)n); | |||
| active = svcntp_b32(svptrue_b32(), pg); | |||
| @@ -1,5 +1,6 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* Copyright 2023 The OpenBLAS Project */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| @@ -50,7 +51,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ | |||
| boffset = b; | |||
| j = 0; | |||
| svbool_t pg = svwhilelt_b32(j, n); | |||
| svbool_t pg = svwhilelt_b32((uint64_t)j, (uint64_t)n); | |||
| uint32_t active = svcntp_b32(svptrue_b32(), pg); | |||
| do { | |||
| @@ -66,7 +67,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ | |||
| aoffset += active * 2; | |||
| j += svcntw(); | |||
| pg = svwhilelt_b32(j, n); | |||
| pg = svwhilelt_b32((uint64_t)j, (uint64_t)n); | |||
| active = svcntp_b32(svptrue_b32(), pg); | |||
| } while (svptest_any(svptrue_b32(), pg)); | |||
| @@ -50,8 +50,8 @@ static FLOAT dot_kernel_sve(BLASLONG n, FLOAT *x, FLOAT *y) { | |||
| BLASLONG sve_width = SVE_WIDTH; | |||
| for (BLASLONG i = 0; i < n; i += sve_width * 2) { | |||
| svbool_t pg_a = SVE_WHILELT(i, n); | |||
| svbool_t pg_b = SVE_WHILELT(i + sve_width, n); | |||
| svbool_t pg_a = SVE_WHILELT((uint64_t)i, (uint64_t)n); | |||
| svbool_t pg_b = SVE_WHILELT((uint64_t)(i + sve_width), (uint64_t)n); | |||
| SVE_TYPE x_vec_a = svld1(pg_a, &x[i]); | |||
| SVE_TYPE y_vec_a = svld1(pg_a, &y[i]); | |||
| @@ -0,0 +1,121 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2023, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include <stdint.h> | |||
| #include <stdio.h> | |||
| #include <arm_sve.h> | |||
| #include "common.h" | |||
| #ifdef DOUBLE /* 64-bit elements: pick the matching SVE count instruction, vector/index types and predicate helpers */ | |||
| #define COUNT "cntd" /* mnemonic pasted into the inline asm that fills sve_size */ | |||
| #define SV_TYPE svfloat64_t | |||
| #define SV_INDEX svuint64_t | |||
| #define SV_INDEXER svindex_u64 | |||
| #define SV_TRUE svptrue_b64 | |||
| #define SV_WHILE svwhilelt_b64 | |||
| #else /* DOUBLE not defined: 32-bit variants of the same names */ | |||
| #define COUNT "cntw" | |||
| #define SV_TYPE svfloat32_t | |||
| #define SV_INDEX svuint32_t | |||
| #define SV_INDEXER svindex_u32 | |||
| #define SV_TRUE svptrue_b32 | |||
| #define SV_WHILE svwhilelt_b32 | |||
| #endif | |||
| #define INNER_COPY(pg, a_offset_inner, b_offset, lda, active) /* one copy step; uses a_vec_real, a_vec_imag and lda_vec from the expanding scope (the lda parameter itself is not referenced) */ \ | |||
| a_vec_real = svld1_gather_index(pg, a_offset_inner, lda_vec); /* gather one value per active lane at the offsets held in lda_vec */ \ | |||
| a_vec_imag = svld1_gather_index(pg, a_offset_inner + 1, lda_vec); /* same offsets, starting one value further on */ \ | |||
| svst2(pg, b_offset, svcreate2(a_vec_real, a_vec_imag)); /* store the two vectors interleaved at b_offset */ \ | |||
| a_offset_inner += 2; \ | |||
| b_offset += active * 2; /* two values were written per active lane */ | |||
| int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) { /* Copies data from a into the buffer b: n is walked in groups of vector lanes, with m INNER_COPY steps per group; always returns 0 */ | |||
| uint64_t sve_size; | |||
| asm(COUNT" %[SIZE_]" : [SIZE_] "=r" (sve_size) : : ); /* sve_size = element count reported by the cntd/cntw instruction selected via COUNT */ | |||
| IFLOAT *a_offset, *a_offset_inner, *b_offset; | |||
| a_offset = a; | |||
| b_offset = b; | |||
| SV_INDEX lda_vec = SV_INDEXER(0LL, lda * 2); /* per-lane offsets 0, 2*lda, 4*lda, ... consumed by the gathers in INNER_COPY */ | |||
| SV_TYPE a_vec_real; | |||
| SV_TYPE a_vec_imag; | |||
| svbool_t pg_true = SV_TRUE(); | |||
| BLASLONG single_vectors_n = n & -sve_size; /* n rounded down to a multiple of sve_size -- assumes sve_size is a power of two (TODO confirm) */ | |||
| for (BLASLONG j = 0; j < single_vectors_n; j += sve_size) { /* full groups: all lanes active */ | |||
| a_offset_inner = a_offset; | |||
| svbool_t pg = pg_true; | |||
| uint64_t active = sve_size; | |||
| uint64_t i_cnt = m >> 2; /* m steps in total: four per loop pass, then the (m & 2) and (m & 1) leftovers */ | |||
| while (i_cnt--) { | |||
| INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
| INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
| INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
| INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
| } | |||
| if (m & 2) { | |||
| INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
| INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
| } | |||
| if (m & 1) { | |||
| INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
| } | |||
| a_offset += sve_size * lda * 2; /* skip past the sve_size lanes just copied */ | |||
| } | |||
| BLASLONG remaining_n = n - single_vectors_n; /* lanes left over after the full groups */ | |||
| if (remaining_n) { | |||
| a_offset_inner = a_offset; | |||
| svbool_t pg = SV_WHILE((uint64_t)0L, (uint64_t)remaining_n); /* predicate enabling only the first remaining_n lanes */ | |||
| uint64_t active = remaining_n; /* b_offset then advances only by the lanes actually stored */ | |||
| uint64_t i_cnt = m >> 2; | |||
| while (i_cnt--) { | |||
| INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
| INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
| INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
| INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
| } | |||
| if (m & 2) { | |||
| INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
| INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
| } | |||
| if (m & 1) { | |||
| INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -107,7 +107,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) { | |||
| BLASLONG remaining_n = n - single_vectors_n; | |||
| if (remaining_n) { | |||
| a_offset_inner = a_offset; | |||
| svbool_t pg = SV_WHILE(0L, remaining_n); | |||
| svbool_t pg = SV_WHILE((uint64_t)0L, (uint64_t)remaining_n); | |||
| uint64_t active = remaining_n; | |||
| uint64_t i_cnt = m >> 2; | |||
| while (i_cnt--) { | |||
| @@ -0,0 +1,115 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2023, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include <stdint.h> | |||
| #include <stdio.h> | |||
| #include <arm_sve.h> | |||
| #include "common.h" | |||
| #ifdef DOUBLE /* 64-bit elements: pick the matching SVE count instruction, two-vector tuple type and predicate helpers */ | |||
| #define COUNT "cntd" /* mnemonic pasted into the inline asm that fills sve_size */ | |||
| #define SV_TYPE svfloat64x2_t | |||
| #define SV_TRUE svptrue_b64 | |||
| #define SV_WHILE svwhilelt_b64 | |||
| #else /* DOUBLE not defined: 32-bit variants of the same names */ | |||
| #define COUNT "cntw" | |||
| #define SV_TYPE svfloat32x2_t | |||
| #define SV_TRUE svptrue_b32 | |||
| #define SV_WHILE svwhilelt_b32 | |||
| #endif | |||
| #define INNER_COPY(pg, a_offset_inner, b_offset, lda, active) /* one copy step; uses a_vec from the expanding scope */ \ | |||
| a_vec = svld2(pg, a_offset_inner); /* two-vector structure load of the active lanes starting at a_offset_inner */ \ | |||
| svst2(pg, b_offset, a_vec); /* write the same pair back out interleaved at b_offset */ \ | |||
| a_offset_inner += lda * 2; /* step the source pointer by lda * 2 values */ \ | |||
| b_offset += active * 2; /* two values were written per active lane */ | |||
| int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ /* Copies data from a into the buffer b: n is walked in groups of vector lanes, with m INNER_COPY steps (each advancing the source by lda * 2 values) per group; always returns 0 */ | |||
| uint64_t sve_size = svcntw(); /* NOTE(review): this initial value is overwritten by the inline asm on the next line */ | |||
| asm(COUNT" %[SIZE_]" : [SIZE_] "=r" (sve_size) : : ); /* sve_size = element count reported by the cntd/cntw instruction selected via COUNT */ | |||
| IFLOAT *a_offset, *a_offset_inner, *b_offset; | |||
| a_offset = a; | |||
| b_offset = b; | |||
| SV_TYPE a_vec; | |||
| svbool_t pg_true = SV_TRUE(); | |||
| BLASLONG single_vectors_n = n & -sve_size; /* n rounded down to a multiple of sve_size -- assumes sve_size is a power of two (TODO confirm) */ | |||
| for (BLASLONG j = 0; j < single_vectors_n; j += sve_size) { /* full groups: all lanes active */ | |||
| a_offset_inner = a_offset; | |||
| svbool_t pg = pg_true; | |||
| uint64_t active = sve_size; | |||
| uint64_t i_cnt = m >> 2; /* m steps in total: four per loop pass, then the (m & 2) and (m & 1) leftovers */ | |||
| while (i_cnt--) { | |||
| INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
| INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
| INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
| INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
| } | |||
| if (m & 2) { | |||
| INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
| INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
| } | |||
| if (m & 1) { | |||
| INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
| } | |||
| a_offset += sve_size * 2; /* next group starts sve_size * 2 values further along a */ | |||
| } | |||
| BLASLONG remaining_n = n - single_vectors_n; /* lanes left over after the full groups */ | |||
| if (remaining_n) { | |||
| a_offset_inner = a_offset; | |||
| svbool_t pg = SV_WHILE((uint64_t)0L, (uint64_t)remaining_n); /* predicate enabling only the first remaining_n lanes */ | |||
| uint64_t active = remaining_n; /* b_offset then advances only by the lanes actually stored */ | |||
| uint64_t i_cnt = m >> 2; | |||
| while (i_cnt--) { | |||
| INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
| INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
| INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
| INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
| } | |||
| if (m & 2) { | |||
| INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
| INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
| } | |||
| if (m & 1) { | |||
| INNER_COPY(pg, a_offset_inner, b_offset, lda, active); | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -100,7 +100,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ | |||
| BLASLONG remaining_n = n - single_vectors_n; | |||
| if (remaining_n) { | |||
| a_offset_inner = a_offset; | |||
| svbool_t pg = SV_WHILE(0L, remaining_n); | |||
| svbool_t pg = SV_WHILE((uint64_t)0L, (uint64_t)remaining_n); | |||
| uint64_t active = remaining_n; | |||
| uint64_t i_cnt = m >> 2; | |||
| while (i_cnt--) { | |||
| @@ -1,5 +1,6 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* Copyright 2023 The OpenBLAS Project */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| @@ -52,7 +53,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| svint64_t one_vec = svdup_s64(1LL); | |||
| int64_t j = 0; | |||
| svbool_t pg = svwhilelt_b64(j, n); | |||
| svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); | |||
| int64_t active = svcntp_b64(svptrue_b64(), pg); | |||
| svint64_t index_neg = svindex_s64(0LL, -1LL); | |||
| svint64_t index = svindex_s64(0LL, 1LL); | |||
| @@ -86,7 +87,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| posX += sve_size; | |||
| posX_vec = svdup_s64(posX); | |||
| j += sve_size; | |||
| pg = svwhilelt_b64(j, n); | |||
| pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); | |||
| active = svcntp_b64(svptrue_b64(), pg); | |||
| } while (svptest_any(svptrue_b64(), pg)); | |||
| @@ -99,7 +100,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| int32_t N = n; | |||
| int32_t j = 0; | |||
| svbool_t pg = svwhilelt_b32(j, N); | |||
| svbool_t pg = svwhilelt_b32((uint32_t)j, (uint32_t)N); | |||
| int32_t active = svcntp_b32(svptrue_b32(), pg); | |||
| svint32_t index_neg = svindex_s32(0, -1); | |||
| svint32_t index = svindex_s32(0, 1); | |||
| @@ -133,7 +134,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| posX += sve_size; | |||
| posX_vec = svdup_s32(posX); | |||
| j += sve_size; | |||
| pg = svwhilelt_b32(j, N); | |||
| pg = svwhilelt_b32((uint32_t)j, (uint32_t)N); | |||
| active = svcntp_b32(svptrue_b32(), pg); | |||
| } while (svptest_any(svptrue_b32(), pg)); | |||
| @@ -1,5 +1,6 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* Copyright 2023 The OpenBLAS Project */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| @@ -52,7 +53,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| svint64_t one_vec = svdup_s64(1LL); | |||
| int64_t j = 0; | |||
| svbool_t pg = svwhilelt_b64(j, n); | |||
| svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); | |||
| int64_t active = svcntp_b64(svptrue_b64(), pg); | |||
| svint64_t index_neg = svindex_s64(0LL, -1LL); | |||
| svint64_t index = svindex_s64(0LL, 1LL); | |||
| @@ -86,7 +87,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| posX += sve_size; | |||
| posX_vec = svdup_s64(posX); | |||
| j += sve_size; | |||
| pg = svwhilelt_b64(j, n); | |||
| pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); | |||
| active = svcntp_b64(svptrue_b64(), pg); | |||
| } while (svptest_any(svptrue_b64(), pg)); | |||
| @@ -99,7 +100,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| int32_t N = n; | |||
| int32_t j = 0; | |||
| svbool_t pg = svwhilelt_b32(j, N); | |||
| svbool_t pg = svwhilelt_b32((uint32_t)j, (uint32_t)N); | |||
| int32_t active = svcntp_b32(svptrue_b32(), pg); | |||
| svint32_t index_neg = svindex_s32(0, -1); | |||
| svint32_t index = svindex_s32(0, 1); | |||
| @@ -133,7 +134,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| posX += sve_size; | |||
| posX_vec = svdup_s32(posX); | |||
| j += sve_size; | |||
| pg = svwhilelt_b32(j, N); | |||
| pg = svwhilelt_b32((uint32_t)j, (uint32_t)N); | |||
| active = svcntp_b32(svptrue_b32(), pg); | |||
| } while (svptest_any(svptrue_b32(), pg)); | |||
| @@ -52,11 +52,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| FLOAT *ao; | |||
| #ifdef DOUBLE | |||
| svint64_t index = svindex_s64(0LL, lda); | |||
| svbool_t pn = svwhilelt_b64(js, n); | |||
| svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); | |||
| int n_active = svcntp_b64(svptrue_b64(), pn); | |||
| #else | |||
| svint32_t index = svindex_s32(0, lda); | |||
| svbool_t pn = svwhilelt_b32(js, n); | |||
| svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); | |||
| int n_active = svcntp_b32(svptrue_b32(), pn); | |||
| #endif | |||
| do | |||
| @@ -123,11 +123,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| posY += n_active; | |||
| js += n_active; | |||
| #ifdef DOUBLE | |||
| pn = svwhilelt_b64(js, n); | |||
| pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); | |||
| n_active = svcntp_b64(svptrue_b64(), pn); | |||
| } while (svptest_any(svptrue_b64(), pn)); | |||
| #else | |||
| pn = svwhilelt_b32(js, n); | |||
| pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); | |||
| n_active = svcntp_b32(svptrue_b32(), pn); | |||
| } while (svptest_any(svptrue_b32(), pn)); | |||
| #endif | |||
| @@ -51,10 +51,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| FLOAT *ao; | |||
| js = 0; | |||
| #ifdef DOUBLE | |||
| svbool_t pn = svwhilelt_b64(js, n); | |||
| svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); | |||
| int n_active = svcntp_b64(svptrue_b64(), pn); | |||
| #else | |||
| svbool_t pn = svwhilelt_b32(js, n); | |||
| svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); | |||
| int n_active = svcntp_b32(svptrue_b32(), pn); | |||
| #endif | |||
| do | |||
| @@ -122,11 +122,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| posY += n_active; | |||
| js += n_active; | |||
| #ifdef DOUBLE | |||
| pn = svwhilelt_b64(js, n); | |||
| pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); | |||
| n_active = svcntp_b64(svptrue_b64(), pn); | |||
| } while (svptest_any(svptrue_b64(), pn)); | |||
| #else | |||
| pn = svwhilelt_b32(js, n); | |||
| pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); | |||
| n_active = svcntp_b32(svptrue_b32(), pn); | |||
| } while (svptest_any(svptrue_b32(), pn)); | |||
| #endif | |||
| @@ -52,11 +52,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| FLOAT *ao; | |||
| #ifdef DOUBLE | |||
| svint64_t index = svindex_s64(0LL, lda); | |||
| svbool_t pn = svwhilelt_b64(js, n); | |||
| svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); | |||
| int n_active = svcntp_b64(svptrue_b64(), pn); | |||
| #else | |||
| svint32_t index = svindex_s32(0, lda); | |||
| svbool_t pn = svwhilelt_b32(js, n); | |||
| svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); | |||
| int n_active = svcntp_b32(svptrue_b32(), pn); | |||
| #endif | |||
| do | |||
| @@ -123,11 +123,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| posY += n_active; | |||
| js += n_active; | |||
| #ifdef DOUBLE | |||
| pn = svwhilelt_b64(js, n); | |||
| pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); | |||
| n_active = svcntp_b64(svptrue_b64(), pn); | |||
| } while (svptest_any(svptrue_b64(), pn)); | |||
| #else | |||
| pn = svwhilelt_b32(js, n); | |||
| pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); | |||
| n_active = svcntp_b32(svptrue_b32(), pn); | |||
| } while (svptest_any(svptrue_b32(), pn)); | |||
| #endif | |||
| @@ -51,10 +51,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| FLOAT *ao; | |||
| js = 0; | |||
| #ifdef DOUBLE | |||
| svbool_t pn = svwhilelt_b64(js, n); | |||
| svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); | |||
| int n_active = svcntp_b64(svptrue_b64(), pn); | |||
| #else | |||
| svbool_t pn = svwhilelt_b32(js, n); | |||
| svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); | |||
| int n_active = svcntp_b32(svptrue_b32(), pn); | |||
| #endif | |||
| do | |||
| @@ -121,11 +121,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| posY += n_active; | |||
| js += n_active; | |||
| #ifdef DOUBLE | |||
| pn = svwhilelt_b64(js, n); | |||
| pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); | |||
| n_active = svcntp_b64(svptrue_b64(), pn); | |||
| } while (svptest_any(svptrue_b64(), pn)); | |||
| #else | |||
| pn = svwhilelt_b32(js, n); | |||
| pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); | |||
| n_active = svcntp_b32(svptrue_b32(), pn); | |||
| } while (svptest_any(svptrue_b32(), pn)); | |||
| #endif | |||
| @@ -56,13 +56,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT | |||
| #ifdef DOUBLE | |||
| int64_t js = 0; | |||
| svint64_t index = svindex_s64(0LL, lda); | |||
| svbool_t pn = svwhilelt_b64(js, n); | |||
| svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); | |||
| int n_active = svcntp_b64(svptrue_b64(), pn); | |||
| #else | |||
| int32_t N = n; | |||
| int32_t js = 0; | |||
| svint32_t index = svindex_s32(0, lda); | |||
| svbool_t pn = svwhilelt_b32(js, N); | |||
| svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); | |||
| int n_active = svcntp_b32(svptrue_b32(), pn); | |||
| #endif | |||
| do { | |||
| @@ -106,11 +106,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT | |||
| js += n_active; | |||
| #ifdef DOUBLE | |||
| pn = svwhilelt_b64(js, n); | |||
| pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); | |||
| n_active = svcntp_b64(svptrue_b64(), pn); | |||
| } while (svptest_any(svptrue_b64(), pn)); | |||
| #else | |||
| pn = svwhilelt_b32(js, N); | |||
| pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); | |||
| n_active = svcntp_b32(svptrue_b32(), pn); | |||
| } while (svptest_any(svptrue_b32(), pn)); | |||
| #endif | |||
| @@ -1,5 +1,6 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* Copyright 2023 The OpenBLAS Project */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| @@ -55,12 +56,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT | |||
| jj = offset; | |||
| #ifdef DOUBLE | |||
| int64_t js = 0; | |||
| svbool_t pn = svwhilelt_b64(js, n); | |||
| svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); | |||
| int n_active = svcntp_b64(svptrue_b64(), pn); | |||
| #else | |||
| int32_t N = n; | |||
| int32_t js = 0; | |||
| svbool_t pn = svwhilelt_b32(js, N); | |||
| svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); | |||
| int n_active = svcntp_b32(svptrue_b32(), pn); | |||
| #endif | |||
| do { | |||
| @@ -104,11 +105,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT | |||
| js += n_active; | |||
| #ifdef DOUBLE | |||
| pn = svwhilelt_b64(js, n); | |||
| pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); | |||
| n_active = svcntp_b64(svptrue_b64(), pn); | |||
| } while (svptest_any(svptrue_b64(), pn)); | |||
| #else | |||
| pn = svwhilelt_b32(js, N); | |||
| pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); | |||
| n_active = svcntp_b32(svptrue_b32(), pn); | |||
| } while (svptest_any(svptrue_b32(), pn)); | |||
| #endif | |||
| @@ -1,5 +1,6 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* Copyright 2023 The OpenBLAS Project */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| @@ -56,13 +57,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT | |||
| #ifdef DOUBLE | |||
| int64_t js = 0; | |||
| svint64_t index = svindex_s64(0LL, lda); | |||
| svbool_t pn = svwhilelt_b64(js, n); | |||
| svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); | |||
| int n_active = svcntp_b64(svptrue_b64(), pn); | |||
| #else | |||
| int32_t N = n; | |||
| int32_t js = 0; | |||
| svint32_t index = svindex_s32(0, lda); | |||
| svbool_t pn = svwhilelt_b32(js, N); | |||
| svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); | |||
| int n_active = svcntp_b32(svptrue_b32(), pn); | |||
| #endif | |||
| do { | |||
| @@ -106,11 +107,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT | |||
| js += n_active; | |||
| #ifdef DOUBLE | |||
| pn = svwhilelt_b64(js, n); | |||
| pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); | |||
| n_active = svcntp_b64(svptrue_b64(), pn); | |||
| } while (svptest_any(svptrue_b64(), pn)); | |||
| #else | |||
| pn = svwhilelt_b32(js, N); | |||
| pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); | |||
| n_active = svcntp_b32(svptrue_b32(), pn); | |||
| } while (svptest_any(svptrue_b32(), pn)); | |||
| #endif | |||
| @@ -1,5 +1,6 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* Copyright 2023 The OpenBLAS Project */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| @@ -55,12 +56,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT | |||
| jj = offset; | |||
| #ifdef DOUBLE | |||
| int64_t js = 0; | |||
| svbool_t pn = svwhilelt_b64(js, n); | |||
| svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); | |||
| int n_active = svcntp_b64(svptrue_b64(), pn); | |||
| #else | |||
| int32_t N = n; | |||
| int32_t js = 0; | |||
| svbool_t pn = svwhilelt_b32(js, N); | |||
| svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); | |||
| int n_active = svcntp_b32(svptrue_b32(), pn); | |||
| #endif | |||
| do { | |||
| @@ -104,11 +105,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT | |||
| js += n_active; | |||
| #ifdef DOUBLE | |||
| pn = svwhilelt_b64(js, n); | |||
| pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); | |||
| n_active = svcntp_b64(svptrue_b64(), pn); | |||
| } while (svptest_any(svptrue_b64(), pn)); | |||
| #else | |||
| pn = svwhilelt_b32(js, N); | |||
| pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); | |||
| n_active = svcntp_b32(svptrue_b32(), pn); | |||
| } while (svptest_any(svptrue_b32(), pn)); | |||
| #endif | |||
| @@ -24,7 +24,12 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #ifdef __NVCOMPILER | |||
| #define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) | |||
| #if (NVCOMPVERS < 2309) | |||
| #pragma opt 1 | |||
| #endif | |||
| #endif | |||
| #include "common.h" | |||
| @@ -239,8 +239,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1rd z15.d, p0/z, [pB, 56] | |||
| add pB, pB, 64 | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] | |||
| .endm | |||
| .macro KERNELv1x4_M1 | |||
| @@ -276,9 +274,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1rd z15.d, p0/z, [pB, 56] | |||
| add pB, pB, 64 | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] | |||
| .endm | |||
| .macro KERNELv1x4_M2 | |||
| @@ -313,11 +308,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| OP_ri z23.d, p1/m, z2.d, z15.d | |||
| ld1rd z15.d, p0/z, [pB, 56] | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| add pB, pB, 64 | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] | |||
| .endm | |||
| .macro KERNELv1x4_E | |||
| @@ -340,11 +331,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| OP_ir z23.d, p1/m, z3.d, z14.d | |||
| OP_ii z22.d, p1/m, z3.d, z15.d | |||
| OP_ri z23.d, p1/m, z2.d, z15.d | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] | |||
| .endm | |||
| .macro KERNELv1x4_SUB | |||
| @@ -382,14 +368,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| OP_ir z23.d, p1/m, z1.d, z14.d | |||
| OP_ii z22.d, p1/m, z1.d, z15.d | |||
| OP_ri z23.d, p1/m, z0.d, z15.d | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| .endm | |||
| .macro SAVEv1x4 | |||
| prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||
| ld2d {z24.d, z25.d}, p1/z, [pCRow0] | |||
| fmla z24.d, p1/m, z16.d, alphaz_R | |||
| fmls z24.d, p1/m, z17.d, alphaz_I | |||
| @@ -407,7 +388,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| st2d {z26.d, z27.d}, p1, [pCRow1] | |||
| add pCRow1, pCRow1, lanes, lsl #4 | |||
| prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||
| ld2d {z28.d, z29.d}, p1/z, [pCRow2] | |||
| fmla z28.d, p1/m, z20.d, alphaz_R | |||
| @@ -425,12 +405,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| fmla z31.d, p1/m, z23.d, alphaz_R | |||
| st2d {z30.d, z31.d}, p1, [pCRow3] | |||
| prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] | |||
| add pCRow3, pCRow3, lanes, lsl #4 // pC = pC + lanes * 2 *8 | |||
| prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] | |||
| .endm | |||
| /******************************************************************************/ | |||
| @@ -466,8 +442,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVEv1x2 | |||
| prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||
| ld2d {z24.d, z25.d}, p1/z, [pCRow0] | |||
| fmla z24.d, p1/m, z16.d, alphaz_R | |||
| fmls z24.d, p1/m, z17.d, alphaz_I | |||
| @@ -485,10 +459,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| st2d {z26.d, z27.d}, p1, [pCRow1] | |||
| add pCRow1, pCRow1, lanes, lsl #4 | |||
| prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||
| prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] | |||
| .endm | |||
| /******************************************************************************/ | |||
| @@ -516,8 +486,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVEv1x1 | |||
| prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||
| ld2d {z24.d, z25.d}, p1/z, [pCRow0] | |||
| fmla z24.d, p1/m, z16.d, alphaz_R | |||
| fmls z24.d, p1/m, z17.d, alphaz_I | |||
| @@ -527,8 +495,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| add pCRow0, pCRow0, lanes, lsl #4 // pC = pC + lanes * 2 *8 | |||
| prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] | |||
| .endm | |||
| /******************************************************************************/ | |||
| @@ -553,9 +519,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| stp x26, x27, [sp, #(9 * 16)] | |||
| str x28, [sp, #(10 * 16)] | |||
| prfm PLDL1KEEP, [origPB] | |||
| prfm PLDL1KEEP, [origPA] | |||
| fmov alphaR, d0 | |||
| dup alphaz_R, alphaR | |||
| fmov alphaI, d1 | |||
| @@ -676,10 +639,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| bne .Lzgemm_kernel_L4_Mv1_46 | |||
| .Lzgemm_kernel_L4_Mv1_100: | |||
| prfm PLDL1KEEP, [pA] | |||
| prfm PLDL1KEEP, [pA, #64] | |||
| prfm PLDL1KEEP, [origPB] | |||
| SAVEv1x4 | |||
| .Lzgemm_kernel_L4_Mv1_END: | |||
| @@ -1,5 +1,6 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* Copyright 2023 The OpenBLAS Project */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| @@ -52,7 +53,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ | |||
| boffset = b; | |||
| j = 0; | |||
| svbool_t pg = svwhilelt_b64(j, n); | |||
| svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); | |||
| uint64_t active = svcntp_b64(svptrue_b64(), pg); | |||
| do { | |||
| @@ -69,7 +70,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ | |||
| aoffset += active * lda * 2; | |||
| j += svcntd(); | |||
| pg = svwhilelt_b64(j, n); | |||
| pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); | |||
| active = svcntp_b64(svptrue_b64(), pg); | |||
| @@ -1,5 +1,6 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* Copyright 2023 The OpenBLAS Project */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| @@ -50,7 +51,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ | |||
| boffset = b; | |||
| j = 0; | |||
| svbool_t pg = svwhilelt_b64(j, n); | |||
| svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); | |||
| uint64_t active = svcntp_b64(svptrue_b64(), pg); | |||
| do { | |||
| @@ -66,7 +67,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ | |||
| aoffset += active * 2; | |||
| j += svcntd(); | |||
| pg = svwhilelt_b64(j, n); | |||
| pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); | |||
| active = svcntp_b64(svptrue_b64(), pg); | |||
| } while (svptest_any(svptrue_b64(), pg)); | |||
| @@ -1,5 +1,6 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* Copyright 2023 The OpenBLAS Project */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| @@ -54,7 +55,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| svint64_t one_vec = svdup_s64(1LL); | |||
| int64_t j = 0; | |||
| svbool_t pg = svwhilelt_b64(j, n); | |||
| svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); | |||
| int64_t active = svcntp_b64(svptrue_b64(), pg); | |||
| svint64_t index_neg = svindex_s64(0LL, -1LL); | |||
| svint64_t index = svindex_s64(0LL, 1LL); | |||
| @@ -79,7 +80,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| gat_ind = svadd_m(cmp, gat_ind, lda_vec); | |||
| gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, 2); | |||
| if (offset <= 0) { | |||
| svbool_t off_g = svwhilelt_b64(offset, 0LL); | |||
| svbool_t off_g = svwhilelt_b64((int64_t)offset, (int64_t)0LL); | |||
| data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag); | |||
| } | |||
| @@ -99,7 +100,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| posX += sve_size; | |||
| posX_vec = svdup_s64(posX); | |||
| j += sve_size; | |||
| pg = svwhilelt_b64(j, n); | |||
| pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); | |||
| active = svcntp_b64(svptrue_b64(), pg); | |||
| } while (svptest_any(svptrue_b64(), pg)); | |||
| @@ -117,7 +118,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| int32_t j = 0; | |||
| int32_t N = n; | |||
| svbool_t pg = svwhilelt_b32(j, N); | |||
| svbool_t pg = svwhilelt_b32((uint32_t)j, (uint32_t)N); | |||
| int32_t active = svcntp_b32(svptrue_b32(), pg); | |||
| svint32_t index_neg = svindex_s32(0, -1); | |||
| svint32_t index = svindex_s32(0, 1); | |||
| @@ -142,7 +143,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| gat_ind = svadd_m(cmp, gat_ind, lda_vec); | |||
| gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, 2); | |||
| if (offset <= 0) { | |||
| svbool_t off_g = svwhilelt_b32(offset, 0); | |||
| svbool_t off_g = svwhilelt_b32((int32_t)offset, (int32_t)0); | |||
| data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag); | |||
| } | |||
| @@ -162,7 +163,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| posX += sve_size; | |||
| posX_vec = svdup_s32(posX); | |||
| j += sve_size; | |||
| pg = svwhilelt_b32(j, N); | |||
| pg = svwhilelt_b32((uint32_t)j, (uint32_t)N); | |||
| active = svcntp_b32(svptrue_b32(), pg); | |||
| } while (svptest_any(svptrue_b32(), pg)); | |||
| @@ -1,5 +1,6 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* Copyright 2023 The OpenBLAS Project */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| @@ -54,7 +55,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| svint64_t one_vec = svdup_s64(1LL); | |||
| int64_t j = 0; | |||
| svbool_t pg = svwhilelt_b64(j, n); | |||
| svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); | |||
| int64_t active = svcntp_b64(svptrue_b64(), pg); | |||
| svint64_t index_neg = svindex_s64(0LL, -1LL); | |||
| svint64_t index = svindex_s64(0LL, 1LL); | |||
| @@ -80,7 +81,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, lda_vec); | |||
| data_vec_imag = svneg_z(pg, data_vec_imag); | |||
| if (offset <= 0) { | |||
| svbool_t off_g = svwhilelt_b64(offset, 0LL); | |||
| svbool_t off_g = svwhilelt_b64((int64_t)offset, (int64_t)0LL); | |||
| data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag); | |||
| } | |||
| @@ -100,7 +101,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| posX += sve_size; | |||
| posX_vec = svdup_s64(posX); | |||
| j += sve_size; | |||
| pg = svwhilelt_b64(j, n); | |||
| pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); | |||
| active = svcntp_b64(svptrue_b64(), pg); | |||
| } while (svptest_any(svptrue_b64(), pg)); | |||
| #else | |||
| @@ -116,7 +117,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| int32_t j = 0; | |||
| int32_t N = n; | |||
| svbool_t pg = svwhilelt_b32(j, N); | |||
| svbool_t pg = svwhilelt_b32((uint32_t)j, (uint32_t)N); | |||
| int32_t active = svcntp_b32(svptrue_b32(), pg); | |||
| svint32_t index_neg = svindex_s32(0, -1); | |||
| svint32_t index = svindex_s32(0, 1); | |||
| @@ -142,7 +143,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, lda_vec); | |||
| data_vec_imag = svneg_z(pg, data_vec_imag); | |||
| if (offset <= 0) { | |||
| svbool_t off_g = svwhilelt_b32(offset, 0); | |||
| svbool_t off_g = svwhilelt_b32((int32_t)offset, (int32_t)0); | |||
| data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag); | |||
| } | |||
| @@ -162,7 +163,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| posX += sve_size; | |||
| posX_vec = svdup_s32(posX); | |||
| j += sve_size; | |||
| pg = svwhilelt_b32(j, N); | |||
| pg = svwhilelt_b32((uint32_t)j, (uint32_t)N); | |||
| active = svcntp_b32(svptrue_b32(), pg); | |||
| } while (svptest_any(svptrue_b32(), pg)); | |||
| @@ -1,5 +1,6 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* Copyright 2023 The OpenBLAS Project */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| @@ -53,7 +54,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| svint64_t one_vec = svdup_s64(1LL); | |||
| int64_t j = 0; | |||
| svbool_t pg = svwhilelt_b64(j, n); | |||
| svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); | |||
| int64_t active = svcntp_b64(svptrue_b64(), pg); | |||
| svint64_t index_neg = svindex_s64(0LL, -1LL); | |||
| svint64_t index = svindex_s64(0LL, 1LL); | |||
| @@ -90,7 +91,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| posX += sve_size; | |||
| posX_vec = svdup_s64(posX); | |||
| j += sve_size; | |||
| pg = svwhilelt_b64(j, n); | |||
| pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); | |||
| active = svcntp_b64(svptrue_b64(), pg); | |||
| } while (svptest_any(svptrue_b64(), pg)); | |||
| @@ -103,7 +104,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| int32_t N = n; | |||
| int32_t j = 0; | |||
| svbool_t pg = svwhilelt_b32(j, N); | |||
| svbool_t pg = svwhilelt_b32((uint32_t)j, (uint32_t)N); | |||
| int32_t active = svcntp_b32(svptrue_b32(), pg); | |||
| svint32_t index_neg = svindex_s32(0, -1); | |||
| svint32_t index = svindex_s32(0, 1); | |||
| @@ -140,7 +141,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| posX += sve_size; | |||
| posX_vec = svdup_s32(posX); | |||
| j += sve_size; | |||
| pg = svwhilelt_b32(j, N); | |||
| pg = svwhilelt_b32((uint32_t)j, (uint32_t)N); | |||
| active = svcntp_b32(svptrue_b32(), pg); | |||
| } while (svptest_any(svptrue_b32(), pg)); | |||
| @@ -1,5 +1,6 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* Copyright 2023 The OpenBLAS Project */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| @@ -53,7 +54,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| svint64_t one_vec = svdup_s64(1LL); | |||
| int64_t j = 0; | |||
| svbool_t pg = svwhilelt_b64(j, n); | |||
| svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); | |||
| int64_t active = svcntp_b64(svptrue_b64(), pg); | |||
| svint64_t index_neg = svindex_s64(0LL, -1LL); | |||
| svint64_t index = svindex_s64(0LL, 1LL); | |||
| @@ -90,7 +91,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| posX += sve_size; | |||
| posX_vec = svdup_s64(posX); | |||
| j += sve_size; | |||
| pg = svwhilelt_b64(j, n); | |||
| pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); | |||
| active = svcntp_b64(svptrue_b64(), pg); | |||
| } while (svptest_any(svptrue_b64(), pg)); | |||
| @@ -103,7 +104,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| int32_t N = n; | |||
| int32_t j = 0; | |||
| svbool_t pg = svwhilelt_b32(j, N); | |||
| svbool_t pg = svwhilelt_b32((uint32_t)j, (uint32_t)N); | |||
| int32_t active = svcntp_b32(svptrue_b32(), pg); | |||
| svint32_t index_neg = svindex_s32(0, -1); | |||
| svint32_t index = svindex_s32(0, 1); | |||
| @@ -140,7 +141,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| posX += sve_size; | |||
| posX_vec = svdup_s32(posX); | |||
| j += sve_size; | |||
| pg = svwhilelt_b32(j, N); | |||
| pg = svwhilelt_b32((uint32_t)j, (uint32_t)N); | |||
| active = svcntp_b32(svptrue_b32(), pg); | |||
| } while (svptest_any(svptrue_b32(), pg)); | |||
| @@ -1,5 +1,6 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* Copyright 2023 The OpenBLAS Project */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| @@ -54,11 +55,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| FLOAT *ao; | |||
| #ifdef DOUBLE | |||
| svint64_t index = svindex_s64(0LL, lda); | |||
| svbool_t pn = svwhilelt_b64(js, n); | |||
| svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); | |||
| int n_active = svcntp_b64(svptrue_b64(), pn); | |||
| #else | |||
| svint32_t index = svindex_s32(0, lda); | |||
| svbool_t pn = svwhilelt_b32(js, n); | |||
| svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); | |||
| int n_active = svcntp_b32(svptrue_b32(), pn); | |||
| #endif | |||
| do | |||
| @@ -132,11 +133,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| posY += n_active; | |||
| js += n_active; | |||
| #ifdef DOUBLE | |||
| pn = svwhilelt_b64(js, n); | |||
| pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); | |||
| n_active = svcntp_b64(svptrue_b64(), pn); | |||
| } while (svptest_any(svptrue_b64(), pn)); | |||
| #else | |||
| pn = svwhilelt_b32(js, n); | |||
| pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); | |||
| n_active = svcntp_b32(svptrue_b32(), pn); | |||
| } while (svptest_any(svptrue_b32(), pn)); | |||
| #endif | |||
| @@ -1,5 +1,6 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* Copyright 2023 The OpenBLAS Project */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| @@ -53,10 +54,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| FLOAT *ao; | |||
| js = 0; | |||
| #ifdef DOUBLE | |||
| svbool_t pn = svwhilelt_b64(js, n); | |||
| svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); | |||
| int n_active = svcntp_b64(svptrue_b64(), pn); | |||
| #else | |||
| svbool_t pn = svwhilelt_b32(js, n); | |||
| svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); | |||
| int n_active = svcntp_b32(svptrue_b32(), pn); | |||
| #endif | |||
| do | |||
| @@ -129,11 +130,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| posY += n_active; | |||
| js += n_active; | |||
| #ifdef DOUBLE | |||
| pn = svwhilelt_b64(js, n); | |||
| pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); | |||
| n_active = svcntp_b64(svptrue_b64(), pn); | |||
| } while (svptest_any(svptrue_b64(), pn)); | |||
| #else | |||
| pn = svwhilelt_b32(js, n); | |||
| pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); | |||
| n_active = svcntp_b32(svptrue_b32(), pn); | |||
| } while (svptest_any(svptrue_b32(), pn)); | |||
| #endif | |||
| @@ -1,5 +1,6 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* Copyright 2023 The OpenBLAS Project */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| @@ -54,11 +55,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| FLOAT *ao; | |||
| #ifdef DOUBLE | |||
| svint64_t index = svindex_s64(0LL, lda); | |||
| svbool_t pn = svwhilelt_b64(js, n); | |||
| svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); | |||
| int n_active = svcntp_b64(svptrue_b64(), pn); | |||
| #else | |||
| svint32_t index = svindex_s32(0, lda); | |||
| svbool_t pn = svwhilelt_b32(js, n); | |||
| svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); | |||
| int n_active = svcntp_b32(svptrue_b32(), pn); | |||
| #endif | |||
| do | |||
| @@ -132,11 +133,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| posY += n_active; | |||
| js += n_active; | |||
| #ifdef DOUBLE | |||
| pn = svwhilelt_b64(js, n); | |||
| pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); | |||
| n_active = svcntp_b64(svptrue_b64(), pn); | |||
| } while (svptest_any(svptrue_b64(), pn)); | |||
| #else | |||
| pn = svwhilelt_b32(js, n); | |||
| pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); | |||
| n_active = svcntp_b32(svptrue_b32(), pn); | |||
| } while (svptest_any(svptrue_b32(), pn)); | |||
| #endif | |||
| @@ -1,5 +1,6 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* Copyright 2023 The OpenBLAS Project */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| @@ -53,10 +54,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| FLOAT *ao; | |||
| js = 0; | |||
| #ifdef DOUBLE | |||
| svbool_t pn = svwhilelt_b64(js, n); | |||
| svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); | |||
| int n_active = svcntp_b64(svptrue_b64(), pn); | |||
| #else | |||
| svbool_t pn = svwhilelt_b32(js, n); | |||
| svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); | |||
| int n_active = svcntp_b32(svptrue_b32(), pn); | |||
| #endif | |||
| do | |||
| @@ -128,11 +129,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON | |||
| posY += n_active; | |||
| js += n_active; | |||
| #ifdef DOUBLE | |||
| pn = svwhilelt_b64(js, n); | |||
| pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); | |||
| n_active = svcntp_b64(svptrue_b64(), pn); | |||
| } while (svptest_any(svptrue_b64(), pn)); | |||
| #else | |||
| pn = svwhilelt_b32(js, n); | |||
| pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); | |||
| n_active = svcntp_b32(svptrue_b32(), pn); | |||
| } while (svptest_any(svptrue_b32(), pn)); | |||
| #endif | |||
| @@ -1,5 +1,6 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* Copyright 2023 The OpenBLAS Project */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| @@ -52,13 +53,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT | |||
| #ifdef DOUBLE | |||
| int64_t js = 0; | |||
| svint64_t index = svindex_s64(0LL, lda); | |||
| svbool_t pn = svwhilelt_b64(js, n); | |||
| svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); | |||
| int n_active = svcntp_b64(svptrue_b64(), pn); | |||
| #else | |||
| int32_t N = n; | |||
| int32_t js = 0; | |||
| svint32_t index = svindex_s32(0, lda); | |||
| svbool_t pn = svwhilelt_b32(js, N); | |||
| svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); | |||
| int n_active = svcntp_b32(svptrue_b32(), pn); | |||
| #endif | |||
| do { | |||
| @@ -106,11 +107,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT | |||
| js += n_active; | |||
| #ifdef DOUBLE | |||
| pn = svwhilelt_b64(js, n); | |||
| pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); | |||
| n_active = svcntp_b64(svptrue_b64(), pn); | |||
| } while (svptest_any(svptrue_b64(), pn)); | |||
| #else | |||
| pn = svwhilelt_b32(js, N); | |||
| pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); | |||
| n_active = svcntp_b32(svptrue_b32(), pn); | |||
| } while (svptest_any(svptrue_b32(), pn)); | |||
| #endif | |||
| @@ -1,5 +1,6 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* Copyright 2023 The OpenBLAS Project */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| @@ -51,12 +52,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT | |||
| jj = offset; | |||
| #ifdef DOUBLE | |||
| int64_t js = 0; | |||
| svbool_t pn = svwhilelt_b64(js, n); | |||
| svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); | |||
| int n_active = svcntp_b64(svptrue_b64(), pn); | |||
| #else | |||
| int32_t N = n; | |||
| int32_t js = 0; | |||
| svbool_t pn = svwhilelt_b32(js, N); | |||
| svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); | |||
| int n_active = svcntp_b32(svptrue_b32(), pn); | |||
| #endif | |||
| do { | |||
| @@ -102,11 +103,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT | |||
| js += n_active; | |||
| #ifdef DOUBLE | |||
| pn = svwhilelt_b64(js, n); | |||
| pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); | |||
| n_active = svcntp_b64(svptrue_b64(), pn); | |||
| } while (svptest_any(svptrue_b64(), pn)); | |||
| #else | |||
| pn = svwhilelt_b32(js, N); | |||
| pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); | |||
| n_active = svcntp_b32(svptrue_b32(), pn); | |||
| } while (svptest_any(svptrue_b32(), pn)); | |||
| #endif | |||
| @@ -1,5 +1,6 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* Copyright 2023 The OpenBLAS Project */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| @@ -52,13 +53,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT | |||
| #ifdef DOUBLE | |||
| int64_t js = 0; | |||
| svint64_t index = svindex_s64(0LL, lda); | |||
| svbool_t pn = svwhilelt_b64(js, n); | |||
| svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); | |||
| int n_active = svcntp_b64(svptrue_b64(), pn); | |||
| #else | |||
| int32_t N = n; | |||
| int32_t js = 0; | |||
| svint32_t index = svindex_s32(0, lda); | |||
| svbool_t pn = svwhilelt_b32(js, N); | |||
| svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); | |||
| int n_active = svcntp_b32(svptrue_b32(), pn); | |||
| #endif | |||
| do { | |||
| @@ -106,11 +107,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT | |||
| js += n_active; | |||
| #ifdef DOUBLE | |||
| pn = svwhilelt_b64(js, n); | |||
| pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); | |||
| n_active = svcntp_b64(svptrue_b64(), pn); | |||
| } while (svptest_any(svptrue_b64(), pn)); | |||
| #else | |||
| pn = svwhilelt_b32(js, N); | |||
| pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); | |||
| n_active = svcntp_b32(svptrue_b32(), pn); | |||
| } while (svptest_any(svptrue_b32(), pn)); | |||
| #endif | |||
| @@ -1,5 +1,6 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* Copyright 2023 The OpenBLAS Project */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| @@ -51,12 +52,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT | |||
| jj = offset; | |||
| #ifdef DOUBLE | |||
| int64_t js = 0; | |||
| svbool_t pn = svwhilelt_b64(js, n); | |||
| svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); | |||
| int n_active = svcntp_b64(svptrue_b64(), pn); | |||
| #else | |||
| int32_t N = n; | |||
| int32_t js = 0; | |||
| svbool_t pn = svwhilelt_b32(js, N); | |||
| svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); | |||
| int n_active = svcntp_b32(svptrue_b32(), pn); | |||
| #endif | |||
| do { | |||
| @@ -102,11 +103,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT | |||
| js += n_active; | |||
| #ifdef DOUBLE | |||
| pn = svwhilelt_b64(js, n); | |||
| pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); | |||
| n_active = svcntp_b64(svptrue_b64(), pn); | |||
| } while (svptest_any(svptrue_b64(), pn)); | |||
| #else | |||
| pn = svwhilelt_b32(js, N); | |||
| pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); | |||
| n_active = svcntp_b32(svptrue_b32(), pn); | |||
| } while (svptest_any(svptrue_b32(), pn)); | |||
| #endif | |||
| @@ -43,7 +43,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT | |||
| BLASLONG i, ii, j, jj; | |||
| FLOAT data01, data02; | |||
| FLOAT data01=0.0, data02=0.0; | |||
| FLOAT *a1; | |||
| lda *= 2; | |||
| @@ -47,6 +47,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT | |||
| FLOAT data05, data06, data07, data08; | |||
| FLOAT *a1, *a2; | |||
| data01=data02=data07=data08=0.0; | |||
| lda *= 2; | |||
| jj = offset; | |||
| @@ -1,3 +1,4 @@ | |||
| ifndef NO_LASX | |||
| DGEMMKERNEL = dgemm_kernel_16x4.S | |||
| DGEMMINCOPY = dgemm_ncopy_16.S | |||
| DGEMMITCOPY = dgemm_tcopy_16.S | |||
| @@ -8,7 +9,26 @@ DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMVNKERNEL = dgemv_n_8_lasx.S | |||
| DGEMVTKERNEL = dgemv_t_8_lasx.S | |||
| SGEMMKERNEL = sgemm_kernel_16x8_lasx.S | |||
| SGEMMINCOPY = sgemm_ncopy_16_lasx.S | |||
| SGEMMITCOPY = sgemm_tcopy_16_lasx.S | |||
| SGEMMONCOPY = sgemm_ncopy_8_lasx.S | |||
| SGEMMOTCOPY = sgemm_tcopy_8_lasx.S | |||
| SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| endif | |||
| DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| @@ -132,12 +132,16 @@ CSWAPKERNEL = ../arm/zswap.c | |||
| ZSWAPKERNEL = ../arm/zswap.c | |||
| SGEMVNKERNEL = ../arm/gemv_n.c | |||
| ifndef DGEMVNKERNEL | |||
| DGEMVNKERNEL = ../arm/gemv_n.c | |||
| endif | |||
| CGEMVNKERNEL = ../arm/zgemv_n.c | |||
| ZGEMVNKERNEL = ../arm/zgemv_n.c | |||
| SGEMVTKERNEL = ../arm/gemv_t.c | |||
| ifndef DGEMVTKERNEL | |||
| DGEMVTKERNEL = ../arm/gemv_t.c | |||
| endif | |||
| CGEMVTKERNEL = ../arm/zgemv_t.c | |||
| ZGEMVTKERNEL = ../arm/zgemv_t.c | |||
| @@ -61,7 +61,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| fmov.d s2, s1 | |||
| bge $r0, N, .L999 | |||
| slli.d INCX, INCX, ZBASE_SHIFT | |||
| bge $r0, INCX, .L999 | |||
| beq $r0, INCX, .L999 | |||
| srai.d I, N, 2 | |||
| bge $r0, I, .L25 | |||
| LD a1, X, 0 * SIZE | |||
| @@ -0,0 +1,546 @@ | |||
| /******************************************************************************* | |||
| Copyright (c) 2023, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *******************************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| #include "loongarch64_asm.S" | |||
| /********************************************************************* | |||
| * 2023/07/14 guxiwei | |||
| * UTEST : OK | |||
| * CTEST : OK | |||
| * TEST : OK | |||
| * | |||
| * | |||
| *********************************************************************/ | |||
| /* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, | |||
| * FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | |||
| */ | |||
| /* Argument registers, named after the parameters of the CNAME prototype in the comment above. */ | |||
| #define M $r4 | |||
| #define N $r5 | |||
| #define ALPHA $f0 | |||
| #define A $r7 | |||
| #define LDA $r8 | |||
| #define X $r9 | |||
| #define INC_X $r10 | |||
| #define Y $r11 | |||
| /* INC_Y lives in $r6; the entry code after PROLOGUE loads it from the stack. */ | |||
| #define INC_Y $r6 | |||
| /* Scratch: J/I are the column/row loop counters, Y_ORG keeps the initial Y, K_LDA and M8 are byte strides. */ | |||
| /* OFFSET is defined but not referenced by the code in this file. */ | |||
| #define J $r12 | |||
| #define I $r13 | |||
| #define K $r14 | |||
| #define Y_ORG $r15 | |||
| #define OFFSET $r16 | |||
| #define K_LDA $r17 | |||
| #define M8 $r18 | |||
| #define T0 $r19 | |||
| /* PA0..PA7: pointers into eight consecutive columns of A (each one LDA bytes after the previous). */ | |||
| #define PA0 $r20 | |||
| #define PA1 $r23 | |||
| #define PA2 $r24 | |||
| #define PA3 $r25 | |||
| #define PA4 $r26 | |||
| #define PA5 $r27 | |||
| #define PA6 $r28 | |||
| #define PA7 $r29 | |||
| /* Vector registers: VALPHA holds alpha replicated; X0..X7 hold alpha-scaled x elements; Y0/Y1 hold eight y values. */ | |||
| #define VALPHA $xr1 | |||
| #define X0 $xr2 | |||
| #define X1 $xr3 | |||
| #define X2 $xr4 | |||
| #define X3 $xr5 | |||
| #define X4 $xr6 | |||
| #define X5 $xr7 | |||
| #define X6 $xr8 | |||
| #define X7 $xr9 | |||
| #define Y0 $xr10 | |||
| #define Y1 $xr11 | |||
| /* A0..A15: tiles of A; the scalar paths below address the same registers as $f12..$f27. */ | |||
| #define A0 $xr12 | |||
| #define A1 $xr13 | |||
| #define A2 $xr14 | |||
| #define A3 $xr15 | |||
| #define A4 $xr16 | |||
| #define A5 $xr17 | |||
| #define A6 $xr18 | |||
| #define A7 $xr19 | |||
| #define A8 $xr20 | |||
| #define A9 $xr21 | |||
| #define A10 $xr22 | |||
| #define A11 $xr23 | |||
| #define A12 $xr24 | |||
| #define A13 $xr25 | |||
| #define A14 $xr26 | |||
| #define A15 $xr27 | |||
| /* DLOAD_X_8 (unit-stride x): fetch the 8 doubles at X+0x00..X+0x38 into X0..X7, then multiply each by VALPHA. */ | |||
| /* NOTE(review): GLDREPL/GMUL are defined outside this file; presumably xvldrepl.d / xvfmul.d per operand group - confirm. */ | |||
| .macro DLOAD_X_8 | |||
| GLDREPL xv, d, X0, X, 0x00, X1, X, 0x08, X2, X, 0x10, X3, X, 0x18, \ | |||
| X4, X, 0x20, X5, X, 0x28, X6, X, 0x30, X7, X, 0x38 | |||
| GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA, X2, X2, VALPHA, X3, X3, VALPHA, \ | |||
| X4, X4, VALPHA, X5, X5, VALPHA, X6, X6, VALPHA, X7, X7, VALPHA | |||
| .endm | |||
| /* DLOAD_X_4 (unit-stride x): same as DLOAD_X_8 but for 4 elements (X0..X3, offsets 0x00..0x18). */ | |||
| .macro DLOAD_X_4 | |||
| GLDREPL xv, d, X0, X, 0x00, X1, X, 0x08, X2, X, 0x10, X3, X, 0x18 | |||
| GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA, X2, X2, VALPHA, X3, X3, VALPHA | |||
| .endm | |||
| /* DLOAD_X_2 (unit-stride x): 2 elements into X0, X1, each multiplied by VALPHA. */ | |||
| .macro DLOAD_X_2 | |||
| GLDREPL xv, d, X0, X, 0x00, X1, X, 0x08 | |||
| GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA | |||
| .endm | |||
| /* DLOAD_X_1: a single element into X0, multiplied by VALPHA; used for every stride variant. */ | |||
| .macro DLOAD_X_1 | |||
| GLDREPL xv, d, X0, X, 0x00 | |||
| GMUL xvf, d, X0, X0, VALPHA | |||
| .endm | |||
| /* Unit-stride y accessors. GLD/GST are defined outside this file; presumably xvld / xvst - confirm. */ | |||
| /* DLOAD_Y_8: read 8 contiguous doubles at Y+0 and Y+0x20 into Y0 and Y1. */ | |||
| .macro DLOAD_Y_8 | |||
| GLD xv, , Y0, Y, 0, Y1, Y, 0x20 | |||
| .endm | |||
| /* DLOAD_Y_4: read 4 contiguous doubles at Y+0 into Y0. */ | |||
| .macro DLOAD_Y_4 | |||
| GLD xv, , Y0, Y, 0 | |||
| .endm | |||
| /* DLOAD_Y_1: read one double into $f10 (same register number as Y0 = $xr10). */ | |||
| .macro DLOAD_Y_1 | |||
| fld.d $f10, Y, 0 | |||
| .endm | |||
| /* DSTORE_Y_8: write Y0 and Y1 back to Y+0 and Y+0x20. */ | |||
| .macro DSTORE_Y_8 | |||
| GST xv, , Y0, Y, 0, Y1, Y, 0x20 | |||
| .endm | |||
| /* DSTORE_Y_4: write Y0 back to Y+0. */ | |||
| .macro DSTORE_Y_4 | |||
| GST xv, , Y0, Y, 0 | |||
| .endm | |||
| /* DSTORE_Y_1: write $f10 back to Y+0. */ | |||
| .macro DSTORE_Y_1 | |||
| fst.d $f10, Y, 0 | |||
| .endm | |||
| // Strided y (inc_y != 1): contiguous vector load/store instructions cannot be used, so y is moved one element at a time. | |||
| /* DLOAD_Y_8_GAP: gather 8 doubles of y spaced INC_Y bytes apart into Y0 (first four) and Y1 (last four). */ | |||
| /* Each value is loaded into a scalar FP register, then the ones held in A1..A3 / A5..A7 are inserted into lanes 1..3. */ | |||
| /* Overwrites T0, A1, A2, A3, A5, A6, A7. NOTE(review): GINSVE0 is presumably xvinsve0.d per group - confirm. */ | |||
| .macro DLOAD_Y_8_GAP | |||
| fld.d $f10, Y, 0 | |||
| fldx.d $f13, Y, INC_Y | |||
| /* T0 = Y + 2*INC_Y */ | |||
| PTR_ALSL T0, INC_Y, Y, 1 | |||
| fld.d $f14, T0, 0 | |||
| fldx.d $f15, T0, INC_Y | |||
| /* T0 = Y + 4*INC_Y: start of the second group of four, which begins in $f11 (Y1). */ | |||
| PTR_ALSL T0, INC_Y, Y, 2 | |||
| fld.d $f11, T0, 0 | |||
| fldx.d $f17, T0, INC_Y | |||
| /* Two single steps bring T0 to Y + 6*INC_Y. */ | |||
| PTR_ADD T0, T0, INC_Y | |||
| PTR_ADD T0, T0, INC_Y | |||
| fld.d $f18, T0, 0 | |||
| fldx.d $f19, T0, INC_Y | |||
| GINSVE0 xv, d, Y0, A1, 1, Y0, A2, 2, Y0, A3, 3, Y1, A5, 1, Y1, A6, 2, Y1, A7, 3 | |||
| .endm | |||
| /* DLOAD_Y_4_GAP: gather 4 doubles of y spaced INC_Y bytes apart into Y0. Overwrites T0, A1, A2, A3. */ | |||
| .macro DLOAD_Y_4_GAP | |||
| fld.d $f10, Y, 0 | |||
| fldx.d $f13, Y, INC_Y | |||
| /* T0 = Y + 2*INC_Y */ | |||
| PTR_ALSL T0, INC_Y, Y, 1 | |||
| fld.d $f14, T0, 0 | |||
| fldx.d $f15, T0, INC_Y | |||
| GINSVE0 xv, d, Y0, A1, 1, Y0, A2, 2, Y0, A3, 3 | |||
| .endm | |||
| /* DSTORE_Y_8_GAP: scatter the four elements of Y0 and then of Y1 to y, INC_Y bytes apart. */ | |||
| /* Y itself is left unchanged; T0 is the running store address. */ | |||
| .macro DSTORE_Y_8_GAP | |||
| xvstelm.d Y0, Y, 0, 0 | |||
| PTR_ADD T0, Y, INC_Y | |||
| xvstelm.d Y0, T0, 0, 1 | |||
| PTR_ADD T0, T0, INC_Y | |||
| xvstelm.d Y0, T0, 0, 2 | |||
| PTR_ADD T0, T0, INC_Y | |||
| xvstelm.d Y0, T0, 0, 3 | |||
| PTR_ADD T0, T0, INC_Y | |||
| xvstelm.d Y1, T0, 0, 0 | |||
| PTR_ADD T0, T0, INC_Y | |||
| xvstelm.d Y1, T0, 0, 1 | |||
| PTR_ADD T0, T0, INC_Y | |||
| xvstelm.d Y1, T0, 0, 2 | |||
| PTR_ADD T0, T0, INC_Y | |||
| xvstelm.d Y1, T0, 0, 3 | |||
| .endm | |||
| /* DSTORE_Y_4_GAP: scatter the four elements of Y0 to y, INC_Y bytes apart. Overwrites T0. */ | |||
| .macro DSTORE_Y_4_GAP | |||
| xvstelm.d Y0, Y, 0, 0 | |||
| PTR_ADD T0, Y, INC_Y | |||
| xvstelm.d Y0, T0, 0, 1 | |||
| PTR_ADD T0, T0, INC_Y | |||
| xvstelm.d Y0, T0, 0, 2 | |||
| PTR_ADD T0, T0, INC_Y | |||
| xvstelm.d Y0, T0, 0, 3 | |||
| .endm | |||
| /* DLOAD_X_8_GAP (strided x): load 8 doubles of x spaced INC_X bytes apart, one per register X0..X7 */ | |||
| /* (xvldrepl.d replicates the loaded value across the register), then multiply each by VALPHA. Overwrites T0. */ | |||
| .macro DLOAD_X_8_GAP | |||
| xvldrepl.d X0, X, 0x00 | |||
| PTR_ADD T0, X, INC_X | |||
| xvldrepl.d X1, T0, 0x00 | |||
| PTR_ADD T0, T0, INC_X | |||
| xvldrepl.d X2, T0, 0x00 | |||
| PTR_ADD T0, T0, INC_X | |||
| xvldrepl.d X3, T0, 0x00 | |||
| PTR_ADD T0, T0, INC_X | |||
| xvldrepl.d X4, T0, 0x00 | |||
| PTR_ADD T0, T0, INC_X | |||
| xvldrepl.d X5, T0, 0x00 | |||
| PTR_ADD T0, T0, INC_X | |||
| xvldrepl.d X6, T0, 0x00 | |||
| PTR_ADD T0, T0, INC_X | |||
| xvldrepl.d X7, T0, 0x00 | |||
| GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA, X2, X2, VALPHA, X3, X3, VALPHA, \ | |||
| X4, X4, VALPHA, X5, X5, VALPHA, X6, X6, VALPHA, X7, X7, VALPHA | |||
| .endm | |||
| /* DLOAD_X_4_GAP (strided x): 4 elements spaced INC_X bytes apart into X0..X3, each multiplied by VALPHA. Overwrites T0. */ | |||
| .macro DLOAD_X_4_GAP | |||
| xvldrepl.d X0, X, 0x00 | |||
| PTR_ADD T0, X, INC_X | |||
| xvldrepl.d X1, T0, 0x00 | |||
| PTR_ADD T0, T0, INC_X | |||
| xvldrepl.d X2, T0, 0x00 | |||
| PTR_ADD T0, T0, INC_X | |||
| xvldrepl.d X3, T0, 0x00 | |||
| GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA, X2, X2, VALPHA, X3, X3, VALPHA | |||
| .endm | |||
| /* DLOAD_X_2_GAP (strided x): 2 elements spaced INC_X bytes apart into X0, X1, each multiplied by VALPHA. Overwrites T0. */ | |||
| .macro DLOAD_X_2_GAP | |||
| xvldrepl.d X0, X, 0x00 | |||
| PTR_ADD T0, X, INC_X | |||
| xvldrepl.d X1, T0, 0x00 | |||
| GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA | |||
| .endm | |||
| /* DGEMV_N_8x8: 8 rows x 8 columns. Two vectors are read through each of PA0..PA7, then Y0/Y1 accumulate A*X per column. */ | |||
| /* NOTE(review): GLD_INC presumably advances the pointer by 0x20 after each load, and GMADD presumably expands to */ | |||
| /* xvfmadd.d out, a, b, c (out = a*b + c) per four-operand group - both are defined outside this file; confirm. */ | |||
| .macro DGEMV_N_8x8 | |||
| GLD_INC xv, , 0x20, \ | |||
| A0, PA0, 0, A1, PA0, 0, \ | |||
| A2, PA1, 0, A3, PA1, 0, \ | |||
| A4, PA2, 0, A5, PA2, 0, \ | |||
| A6, PA3, 0, A7, PA3, 0, \ | |||
| A8, PA4, 0, A9, PA4, 0, \ | |||
| A10, PA5, 0, A11, PA5, 0, \ | |||
| A12, PA6, 0, A13, PA6, 0, \ | |||
| A14, PA7, 0, A15, PA7, 0 | |||
| GMADD xvf, d, Y0, A0, X0, Y0, Y1, A1, X0, Y1, \ | |||
| Y0, A2, X1, Y0, Y1, A3, X1, Y1, \ | |||
| Y0, A4, X2, Y0, Y1, A5, X2, Y1, \ | |||
| Y0, A6, X3, Y0, Y1, A7, X3, Y1, \ | |||
| Y0, A8, X4, Y0, Y1, A9, X4, Y1, \ | |||
| Y0, A10, X5, Y0, Y1, A11, X5, Y1, \ | |||
| Y0, A12, X6, Y0, Y1, A13, X6, Y1, \ | |||
| Y0, A14, X7, Y0, Y1, A15, X7, Y1 | |||
| .endm | |||
| /* DGEMV_N_4x8: 4 rows x 8 columns. One vector is read through each of PA0..PA7 and accumulated into Y0. */ | |||
| .macro DGEMV_N_4x8 | |||
| GLD_INC xv, , 0x20, A0, PA0, 0, \ | |||
| A2, PA1, 0, \ | |||
| A4, PA2, 0, \ | |||
| A6, PA3, 0, \ | |||
| A8, PA4, 0, \ | |||
| A10, PA5, 0, \ | |||
| A12, PA6, 0, \ | |||
| A14, PA7, 0 | |||
| GMADD xvf, d, Y0, A0, X0, Y0, \ | |||
| Y0, A2, X1, Y0, \ | |||
| Y0, A4, X2, Y0, \ | |||
| Y0, A6, X3, Y0, \ | |||
| Y0, A8, X4, Y0, \ | |||
| Y0, A10, X5, Y0, \ | |||
| Y0, A12, X6, Y0, \ | |||
| Y0, A14, X7, Y0 | |||
| .endm | |||
| /* DGEMV_N_1x8: 1 row x 8 columns, scalar. One double is read through each of PA0..PA7 (step 0x08) and */ | |||
| /* multiplied by $f2..$f9 (same register numbers as X0..X7), accumulating into $f10 (same number as Y0). */ | |||
| /* NOTE(review): GLD_INC/GMADD are defined outside this file; presumably fld.d + pointer bump and fmadd.d - confirm. */ | |||
| .macro DGEMV_N_1x8 | |||
| GLD_INC f, d, 0x08, $f12, PA0, 0, $f14, PA1, 0, $f16, PA2, 0, $f18, PA3, 0, \ | |||
| $f20, PA4, 0, $f22, PA5, 0, $f24, PA6, 0, $f26, PA7, 0 | |||
| GMADD f, d, $f10, $f12, $f2, $f10, \ | |||
| $f10, $f14, $f3, $f10, \ | |||
| $f10, $f16, $f4, $f10, \ | |||
| $f10, $f18, $f5, $f10, \ | |||
| $f10, $f20, $f6, $f10, \ | |||
| $f10, $f22, $f7, $f10, \ | |||
| $f10, $f24, $f8, $f10, \ | |||
| $f10, $f26, $f9, $f10 | |||
| .endm | |||
| /* DGEMV_N_8x4: 8 rows x 4 columns. Two vectors through each of PA0..PA3; Y0/Y1 accumulate with X0..X3. */ | |||
| .macro DGEMV_N_8x4 | |||
| GLD_INC xv, , 0x20, \ | |||
| A0, PA0, 0, A1, PA0, 0, \ | |||
| A2, PA1, 0, A3, PA1, 0, \ | |||
| A4, PA2, 0, A5, PA2, 0, \ | |||
| A6, PA3, 0, A7, PA3, 0 | |||
| GMADD xvf, d, Y0, A0, X0, Y0, Y1, A1, X0, Y1, \ | |||
| Y0, A2, X1, Y0, Y1, A3, X1, Y1, \ | |||
| Y0, A4, X2, Y0, Y1, A5, X2, Y1, \ | |||
| Y0, A6, X3, Y0, Y1, A7, X3, Y1 | |||
| .endm | |||
| /* DGEMV_N_4x4: 4 rows x 4 columns. One vector through each of PA0..PA3, accumulated into Y0. */ | |||
| .macro DGEMV_N_4x4 | |||
| GLD_INC xv, , 0x20, A0, PA0, 0, A2, PA1, 0, A4, PA2, 0, A6, PA3, 0 | |||
| GMADD xvf, d, Y0, A0, X0, Y0, Y0, A2, X1, Y0, \ | |||
| Y0, A4, X2, Y0, Y0, A6, X3, Y0 | |||
| .endm | |||
| /* DGEMV_N_1x4: 1 row x 4 columns, scalar; accumulates into $f10 using $f2..$f5. */ | |||
| .macro DGEMV_N_1x4 | |||
| GLD_INC f, d, 0x08, $f12, PA0, 0, $f14, PA1, 0, $f16, PA2, 0, $f18, PA3, 0 | |||
| GMADD f, d, $f10, $f12, $f2, $f10, $f10, $f14, $f3, $f10, \ | |||
| $f10, $f16, $f4, $f10, $f10, $f18, $f5, $f10 | |||
| .endm | |||
| /* DGEMV_N_8x2: 8 rows x 2 columns. Two vectors through each of PA0, PA1; Y0/Y1 accumulate with X0, X1. */ | |||
| .macro DGEMV_N_8x2 | |||
| GLD_INC xv, , 0x20, \ | |||
| A0, PA0, 0, A1, PA0, 0, \ | |||
| A2, PA1, 0, A3, PA1, 0 | |||
| GMADD xvf, d, Y0, A0, X0, Y0, Y1, A1, X0, Y1, \ | |||
| Y0, A2, X1, Y0, Y1, A3, X1, Y1 | |||
| .endm | |||
| /* DGEMV_N_4x2: 4 rows x 2 columns, accumulated into Y0. */ | |||
| .macro DGEMV_N_4x2 | |||
| GLD_INC xv, , 0x20, A0, PA0, 0, A2, PA1, 0 | |||
| GMADD xvf, d, Y0, A0, X0, Y0, \ | |||
| Y0, A2, X1, Y0 | |||
| .endm | |||
| /* DGEMV_N_1x2: 1 row x 2 columns, scalar; accumulates into $f10 using $f2, $f3. */ | |||
| .macro DGEMV_N_1x2 | |||
| GLD_INC f, d, 0x08, $f12, PA0, 0, $f14, PA1, 0 | |||
| GMADD f, d, $f10, $f12, $f2, $f10, \ | |||
| $f10, $f14, $f3, $f10 | |||
| .endm | |||
| /* DGEMV_N_1x1: 1 row x 1 column, scalar: $f10 = A[PA0] * $f2 + $f10, then PA0 += 8. */ | |||
| .macro DGEMV_N_1x1 | |||
| fld.d $f12, PA0, 0 | |||
| PTR_ADDI PA0, PA0, 0x08 | |||
| fmadd.d $f10, $f12, $f2, $f10 | |||
| .endm | |||
| /* DGEMV_N: loop nest of the kernel, instantiated once per (inc_x, inc_y) stride combination by the dispatch code. */ | |||
| /*   XW           - tag spliced into every local label so that each instantiation gets its own labels. */ | |||
| /*   X_8 .. X_1   - suffixes appended to DLOAD_ to select the x loader for 8 / 4 / 2 / 1 columns. */ | |||
| /*   Y_8, Y_4, Y_1 - suffixes appended to DLOAD_ / DSTORE_ to select the y accessors for 8 / 4 / 1 rows. */ | |||
| /* Columns are consumed 8 at a time, then by the 4, 2 and 1 bits of N. For each column group the rows are walked */ | |||
| /* 8 at a time, then 4, then one by one, and y is re-read from Y_ORG for every column group. */ | |||
| /* K is zeroed and incremented alongside the row loops but is not read anywhere in this macro. */ | |||
| /* Every path ends with a branch to .L_END, so an instantiation never falls through into the next one. */ | |||
| .macro DGEMV_N XW:req, X_8:req, X_4:req, X_2:req, X_1:req, Y_8:req, Y_4:req, Y_1:req | |||
| PTR_SRLI J, N, 3 | |||
| beqz J, .L_\XW\()_N_7 | |||
| /* K_LDA = 8*LDA - M*8: the row loops advance each PAi by M*8 bytes, so adding K_LDA moves it 8 columns on. */ | |||
| PTR_SLLI K_LDA, LDA, 3 | |||
| PTR_SUB K_LDA, K_LDA, M8 | |||
| .L_\XW\()_N_L8: | |||
| DLOAD_\X_8 | |||
| xor K, K, K | |||
| move Y, Y_ORG | |||
| PTR_SRLI I, M, 3 | |||
| beqz I, .L_\XW\()_M_7 | |||
| .align 5 | |||
| .L_\XW\()_M_L8: | |||
| DLOAD_\Y_8 | |||
| DGEMV_N_8x8 | |||
| DSTORE_\Y_8 | |||
| PTR_ADDI I, I, -1 | |||
| PTR_ALSL Y, INC_Y, Y, 3 | |||
| PTR_ADDI K, K, 8 | |||
| bnez I, .L_\XW\()_M_L8 | |||
| .L_\XW\()_M_7: | |||
| andi I, M, 4 | |||
| beqz I, .L_\XW\()_M_3 | |||
| DLOAD_\Y_4 | |||
| DGEMV_N_4x8 | |||
| DSTORE_\Y_4 | |||
| PTR_ALSL Y, INC_Y, Y, 2 | |||
| PTR_ADDI K, K, 4 | |||
| .L_\XW\()_M_3: | |||
| andi I, M, 3 | |||
| beqz I, .L_\XW\()_M_END | |||
| .align 5 | |||
| .L_\XW\()_M_L1: | |||
| DLOAD_\Y_1 | |||
| DGEMV_N_1x8 | |||
| DSTORE_\Y_1 | |||
| PTR_ADDI I, I, -1 | |||
| PTR_ADD Y, Y, INC_Y | |||
| PTR_ADDI K, K, 1 | |||
| bnez I, .L_\XW\()_M_L1 | |||
| .L_\XW\()_M_END: | |||
| PTR_ADDI J, J, -1 | |||
| /* Move all eight column pointers to the next group of 8 columns. */ | |||
| #if __loongarch_grlen == 64 | |||
| GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ | |||
| PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA | |||
| #else | |||
| GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ | |||
| PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA | |||
| #endif | |||
| PTR_ALSL X, INC_X, X, 3 | |||
| bnez J, .L_\XW\()_N_L8 | |||
| /* Remaining columns: 4-column group if bit 2 of N is set. */ | |||
| .L_\XW\()_N_7: | |||
| andi J, N, 4 | |||
| beqz J, .L_\XW\()_N_3 | |||
| DLOAD_\X_4 | |||
| xor K, K, K | |||
| move Y, Y_ORG | |||
| PTR_SRLI I, M, 3 | |||
| beqz I, .L_\XW\()_N_4_M_7 | |||
| .align 5 | |||
| .L_\XW\()_N_4_M_L8: | |||
| DLOAD_\Y_8 | |||
| DGEMV_N_8x4 | |||
| DSTORE_\Y_8 | |||
| PTR_ADDI I, I, -1 | |||
| PTR_ADDI K, K, 8 | |||
| PTR_ALSL Y, INC_Y, Y, 3 | |||
| bnez I, .L_\XW\()_N_4_M_L8 | |||
| .L_\XW\()_N_4_M_7: | |||
| andi I, M, 4 | |||
| beqz I, .L_\XW\()_N_4_M_3 | |||
| DLOAD_\Y_4 | |||
| DGEMV_N_4x4 | |||
| DSTORE_\Y_4 | |||
| PTR_ALSL Y, INC_Y, Y, 2 | |||
| PTR_ADDI K, K, 4 | |||
| .L_\XW\()_N_4_M_3: | |||
| andi I, M, 3 | |||
| beqz I, .L_\XW\()_N_4_M_END | |||
| .align 5 | |||
| .L_\XW\()_N_4_M_L1: | |||
| DLOAD_\Y_1 | |||
| DGEMV_N_1x4 | |||
| DSTORE_\Y_1 | |||
| PTR_ADDI I, I, -1 | |||
| PTR_ADD Y, Y, INC_Y | |||
| PTR_ADDI K, K, 1 | |||
| bnez I, .L_\XW\()_N_4_M_L1 | |||
| .L_\XW\()_N_4_M_END: | |||
| /* K_LDA = 4*LDA - M*8: advance PA0..PA3 by 4 columns. The non-64-bit branch uses the word-sized add. */ | |||
| PTR_SLLI K_LDA, LDA, 2 | |||
| PTR_SUB K_LDA, K_LDA, M8 | |||
| #if __loongarch_grlen == 64 | |||
| GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA | |||
| #else | |||
| GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA | |||
| #endif | |||
| PTR_ALSL X, INC_X, X, 2 | |||
| /* 2-column group if bit 1 of N is set. */ | |||
| .L_\XW\()_N_3: | |||
| andi J, N, 2 | |||
| beqz J, .L_\XW\()_N_1 | |||
| DLOAD_\X_2 | |||
| xor K, K, K | |||
| move Y, Y_ORG | |||
| PTR_SRLI I, M, 3 | |||
| beqz I, .L_\XW\()_N_2_M_7 | |||
| .align 5 | |||
| .L_\XW\()_N_2_M_L8: | |||
| DLOAD_\Y_8 | |||
| DGEMV_N_8x2 | |||
| DSTORE_\Y_8 | |||
| PTR_ADDI I, I, -1 | |||
| PTR_ADDI K, K, 8 | |||
| PTR_ALSL Y, INC_Y, Y, 3 | |||
| bnez I, .L_\XW\()_N_2_M_L8 | |||
| .L_\XW\()_N_2_M_7: | |||
| andi I, M, 4 | |||
| beqz I, .L_\XW\()_N_2_M_3 | |||
| DLOAD_\Y_4 | |||
| DGEMV_N_4x2 | |||
| DSTORE_\Y_4 | |||
| PTR_ALSL Y, INC_Y, Y, 2 | |||
| PTR_ADDI K, K, 4 | |||
| .L_\XW\()_N_2_M_3: | |||
| andi I, M, 3 | |||
| beqz I, .L_\XW\()_N_2_M_END | |||
| .align 5 | |||
| .L_\XW\()_N_2_M_L1: | |||
| DLOAD_\Y_1 | |||
| DGEMV_N_1x2 | |||
| DSTORE_\Y_1 | |||
| PTR_ADDI I, I, -1 | |||
| PTR_ADD Y, Y, INC_Y | |||
| PTR_ADDI K, K, 1 | |||
| bnez I, .L_\XW\()_N_2_M_L1 | |||
| .L_\XW\()_N_2_M_END: | |||
| /* K_LDA = 2*LDA - M*8: advance PA0 and PA1 by 2 columns. */ | |||
| PTR_SLLI K_LDA, LDA, 1 | |||
| PTR_SUB K_LDA, K_LDA, M8 | |||
| PTR_ADD PA0, PA0, K_LDA | |||
| PTR_ADD PA1, PA1, K_LDA | |||
| PTR_ALSL X, INC_X, X, 1 | |||
| /* Last single column if bit 0 of N is set; handled entirely by the scalar row loop. */ | |||
| .L_\XW\()_N_1: | |||
| andi J, N, 1 | |||
| beqz J, .L_END | |||
| DLOAD_\X_1 | |||
| xor K, K, K | |||
| move Y, Y_ORG | |||
| move I, M | |||
| beqz I, .L_END | |||
| .align 5 | |||
| .L_\XW\()_N_1_M_L1: | |||
| DLOAD_\Y_1 | |||
| DGEMV_N_1x1 | |||
| DSTORE_\Y_1 | |||
| PTR_ADDI I, I, -1 | |||
| PTR_ADD Y, Y, INC_Y | |||
| PTR_ADDI K, K, 1 | |||
| bnez I, .L_\XW\()_N_1_M_L1 | |||
| b .L_END | |||
| .endm | |||
| /* Kernel entry: set up strides and column pointers, then jump to the DGEMV_N instantiation matching the strides. */ | |||
| PROLOGUE | |||
| /* inc_y is read from the top of the stack before push_if_used runs. */ | |||
| PTR_LD INC_Y, $sp, 0 | |||
| /* NOTE(review): push_if_used/pop_if_used are defined elsewhere; presumably they save/restore callee-saved registers. */ | |||
| push_if_used 17 + 7, 24 + 4 | |||
| PTR_ADDI K, $r0, 0x01 | |||
| PTR_SUB I, INC_X, K | |||
| PTR_SUB J, INC_Y, K | |||
| maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */ | |||
| maskeqz J, K, J /* if(inc_y == 1) j = 0; else j = 1; */ | |||
| /* I = 2*I + J: index 0..3 into .L_GAP_TABLE. */ | |||
| PTR_ALSL I, I, J, 1 | |||
| /* Convert LDA, INC_X and INC_Y to byte strides (x8) and set M8 = M*8. */ | |||
| GSLLI , d, LDA, LDA, 3, INC_X, INC_X, 3, INC_Y, INC_Y, 3, M8, M, 3 | |||
| /* Replicate alpha (element 0 of $xr0) across VALPHA. */ | |||
| xvreplve0.d VALPHA, $xr0 | |||
| move Y_ORG, Y | |||
| move PA0, A | |||
| /* PA1..PA7 = PA0 + 1..7 columns. */ | |||
| #if __loongarch_grlen == 64 | |||
| GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ | |||
| PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA | |||
| #else | |||
| GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ | |||
| PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA | |||
| #endif | |||
| /* Table dispatch: each entry is a 16-bit offset relative to .L_GAP_TABLE, hence index*2 for the entry address. */ | |||
| la.local T0, .L_GAP_TABLE | |||
| PTR_ALSL I, I, T0, 1 | |||
| ld.h K, I, 0 | |||
| PTR_ADD T0, T0, K | |||
| jirl $r0, T0, 0 | |||
| .L_GAP_TABLE: | |||
| .hword .L_GAP_0_0 - .L_GAP_TABLE | |||
| .hword .L_GAP_0_1 - .L_GAP_TABLE | |||
| .hword .L_GAP_1_0 - .L_GAP_TABLE | |||
| .hword .L_GAP_1_1 - .L_GAP_TABLE | |||
| .L_GAP_0_0: /* if (inc_x == 1) && (incy == 1) */ | |||
| DGEMV_N GAP_0_0, X_8, X_4, X_2, X_1, Y_8, Y_4, Y_1 | |||
| .L_GAP_0_1: /* if (inc_x == 1) && (incy != 1) */ | |||
| DGEMV_N GAP_0_1, X_8, X_4, X_2, X_1, Y_8_GAP, Y_4_GAP, Y_1 | |||
| .L_GAP_1_0: /* if (inc_x != 1) && (incy == 1) */ | |||
| DGEMV_N GAP_1_0, X_8_GAP, X_4_GAP, X_2_GAP, X_1, Y_8, Y_4, Y_1 | |||
| .L_GAP_1_1: /* if (inc_x != 1) && (incy != 1) */ | |||
| DGEMV_N GAP_1_1, X_8_GAP, X_4_GAP, X_2_GAP, X_1, Y_8_GAP, Y_4_GAP, Y_1 | |||
| /* Common exit reached from every DGEMV_N instantiation. */ | |||
| .L_END: | |||
| pop_if_used 17 + 7, 24 + 4 | |||
| jirl $r0, $r1, 0x0 | |||
| EPILOGUE | |||
| @@ -0,0 +1,468 @@ | |||
| /******************************************************************************* | |||
| Copyright (c) 2023, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *******************************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| #include "loongarch64_asm.S" | |||
| /********************************************************************* | |||
| * 2023/07/17 guxiwei | |||
| * UTEST : OK | |||
| * CTEST : OK | |||
| * TEST : OK | |||
| * | |||
| * | |||
| *********************************************************************/ | |||
| /* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, | |||
| * FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | |||
| */ | |||
| /* Argument registers, named after the parameters of the CNAME prototype in the comment above. */ | |||
| #define M $r4 | |||
| #define N $r5 | |||
| #define ALPHA $f0 | |||
| #define A $r7 | |||
| #define LDA $r8 | |||
| #define X $r9 | |||
| #define INC_X $r10 | |||
| #define Y $r11 | |||
| /* INC_Y lives in $r6; the entry code after PROLOGUE loads it from the stack. */ | |||
| #define INC_Y $r6 | |||
| #define J $r12 | |||
| #define I $r13 | |||
| /* K and PY0 share $r14: K is only used by the entry code, PY0 only inside the DGEMV_T macro. */ | |||
| #define K $r14 | |||
| #define PY0 $r14 | |||
| #define X_ORG $r15 | |||
| #define PY1 $r16 | |||
| #define K_LDA $r17 | |||
| #define PY2 $r18 | |||
| #define T0 $r19 | |||
| /* PA0..PA7: pointers into eight consecutive columns of A (each one LDA bytes after the previous). */ | |||
| #define PA0 $r20 | |||
| #define PA1 $r23 | |||
| #define PA2 $r24 | |||
| #define PA3 $r25 | |||
| #define PA4 $r26 | |||
| #define PA5 $r27 | |||
| #define PA6 $r28 | |||
| #define PA7 $r29 | |||
| #define M8 $r30 | |||
| /* VALPHA is $xr0, the same register number as ALPHA ($f0). */ | |||
| #define VALPHA $xr0 | |||
| #define X0 $xr1 | |||
| #define X1 $xr2 | |||
| #define A0 $xr3 | |||
| #define A1 $xr4 | |||
| #define A2 $xr5 | |||
| #define A3 $xr6 | |||
| #define A4 $xr7 | |||
| #define A5 $xr8 | |||
| #define A6 $xr9 | |||
| #define A7 $xr10 | |||
| #define A8 $xr11 | |||
| #define A9 $xr12 | |||
| #define A10 $xr13 | |||
| #define A11 $xr14 | |||
| #define A12 $xr15 | |||
| #define A13 $xr16 | |||
| #define A14 $xr17 | |||
| #define A15 $xr18 | |||
| /* TP0..TP7: per-column vector accumulators. */ | |||
| #define TP0 $xr19 | |||
| #define TP1 $xr20 | |||
| #define TP2 $xr21 | |||
| #define TP3 $xr22 | |||
| #define TP4 $xr23 | |||
| #define TP5 $xr24 | |||
| #define TP6 $xr25 | |||
| #define TP7 $xr26 | |||
| /* Y0..Y7 reuse the registers of A0..A7 ($xr3..$xr10); they are written only after the A tiles are consumed. */ | |||
| #define Y0 $xr3 | |||
| #define Y1 $xr4 | |||
| #define Y2 $xr5 | |||
| #define Y3 $xr6 | |||
| #define Y4 $xr7 | |||
| #define Y5 $xr8 | |||
| #define Y6 $xr9 | |||
| #define Y7 $xr10 | |||
| /* ZERO_Y8/4/2/1: clear the first 8/4/2/1 accumulators TP0.. by XOR-ing each register with itself. */ | |||
| /* NOTE(review): GXOR is defined outside this file; presumably xvxor.v per three-operand group - confirm. */ | |||
| .macro ZERO_Y8 | |||
| GXOR xv, v, TP0, TP0, TP0, TP1, TP1, TP1, TP2, TP2, TP2, TP3, TP3, TP3, \ | |||
| TP4, TP4, TP4, TP5, TP5, TP5, TP6, TP6, TP6, TP7, TP7, TP7 | |||
| .endm | |||
| .macro ZERO_Y4 | |||
| GXOR xv, v, TP0, TP0, TP0, TP1, TP1, TP1, TP2, TP2, TP2, TP3, TP3, TP3 | |||
| .endm | |||
| .macro ZERO_Y2 | |||
| GXOR xv, v, TP0, TP0, TP0, TP1, TP1, TP1 | |||
| .endm | |||
| .macro ZERO_Y1 | |||
| GXOR xv, v, TP0, TP0, TP0 | |||
| .endm | |||
| /* DLOAD_X8 (unit-stride x): read 8 contiguous doubles at X+0x00 and X+0x20 into X0 and X1. */ | |||
| .macro DLOAD_X8 | |||
| GLD xv, , X0, X, 0x00, X1, X, 0x20 | |||
| .endm | |||
| /* DLOAD_X4 (unit-stride x): read 4 contiguous doubles at X+0x00 into X0. */ | |||
| .macro DLOAD_X4 | |||
| GLD xv, , X0, X, 0x00 | |||
| .endm | |||
| /* DLOAD_X8_GAP (strided x): gather 8 doubles spaced INC_X bytes apart into X0 (first four) and X1 (last four). */ | |||
| /* $f1..$f5 have the same register numbers as X0, X1, A0, A1, A2, so the scalar loads feed the inserts directly. */ | |||
| /* Overwrites T0, A0, A1, A2. NOTE(review): GINSVE0 is presumably xvinsve0.d per group - confirm. */ | |||
| .macro DLOAD_X8_GAP | |||
| fld.d $f1, X, 0x00 | |||
| fldx.d $f2, X, INC_X | |||
| /* T0 = X + 2*INC_X */ | |||
| PTR_ALSL T0, INC_X, X, 1 | |||
| fld.d $f3, T0, 0x00 | |||
| fldx.d $f4, T0, INC_X | |||
| GINSVE0 xv, d, X0, X1, 1, X0, A0, 2, X0, A1, 3 | |||
| /* T0 = X + 4*INC_X: second group of four, built directly in $f2 (X1). */ | |||
| PTR_ALSL T0, INC_X, X, 2 | |||
| fld.d $f2, T0, 0x00 | |||
| fldx.d $f3, T0, INC_X | |||
| /* T0 += 2*INC_X */ | |||
| PTR_ALSL T0, INC_X, T0, 1 | |||
| fld.d $f4, T0, 0x00 | |||
| fldx.d $f5, T0, INC_X | |||
| GINSVE0 xv, d, X1, A0, 1, X1, A1, 2, X1, A2, 3 | |||
| .endm | |||
| /* DLOAD_X4_GAP (strided x): gather 4 doubles spaced INC_X bytes apart into X0. Overwrites T0, X1, A0, A1. */ | |||
| .macro DLOAD_X4_GAP | |||
| fld.d $f1, X, 0x00 | |||
| fldx.d $f2, X, INC_X | |||
| /* T0 = X + 2*INC_X */ | |||
| PTR_ALSL T0, INC_X, X, 1 | |||
| fld.d $f3, T0, 0x00 | |||
| fldx.d $f4, T0, INC_X | |||
| GINSVE0 xv, d, X0, X1, 1, X0, A0, 2, X0, A1, 3 | |||
| .endm | |||
| /* DGEMV_T_8x8: 8 columns x 8 rows. Two vectors are read through each of PA0..PA7; column j accumulates */ | |||
| /* both of them, multiplied by X0 and X1 respectively, into TPj. */ | |||
| /* NOTE(review): GLD_INC presumably advances the pointer by 0x20 after each load, and GMADD presumably expands to */ | |||
| /* xvfmadd.d out, a, b, c (out = a*b + c) per four-operand group - both are defined outside this file; confirm. */ | |||
| .macro DGEMV_T_8x8 | |||
| GLD_INC xv, , 0x20, \ | |||
| A0, PA0, 0, A1, PA0, 0, \ | |||
| A2, PA1, 0, A3, PA1, 0, \ | |||
| A4, PA2, 0, A5, PA2, 0, \ | |||
| A6, PA3, 0, A7, PA3, 0, \ | |||
| A8, PA4, 0, A9, PA4, 0, \ | |||
| A10, PA5, 0, A11, PA5, 0, \ | |||
| A12, PA6, 0, A13, PA6, 0, \ | |||
| A14, PA7, 0, A15, PA7, 0 | |||
| GMADD xvf, d, TP0, A0, X0, TP0, TP0, A1, X1, TP0, \ | |||
| TP1, A2, X0, TP1, TP1, A3, X1, TP1, \ | |||
| TP2, A4, X0, TP2, TP2, A5, X1, TP2, \ | |||
| TP3, A6, X0, TP3, TP3, A7, X1, TP3, \ | |||
| TP4, A8, X0, TP4, TP4, A9, X1, TP4, \ | |||
| TP5, A10, X0, TP5, TP5, A11, X1, TP5, \ | |||
| TP6, A12, X0, TP6, TP6, A13, X1, TP6, \ | |||
| TP7, A14, X0, TP7, TP7, A15, X1, TP7 | |||
| .endm | |||
| /* DGEMV_T_8x4: 8 columns x 4 rows. One vector is read through each of PA0..PA7 and accumulated, multiplied by X0, */ | |||
| /* into the matching TP0..TP7. */ | |||
| /* NOTE(review): GLD_INC/GMADD are defined outside this file; presumably xvld + pointer bump and xvfmadd.d - confirm. */ | |||
| .macro DGEMV_T_8x4 | |||
| GLD_INC xv, , 0x20, A0, PA0, 0, A2, PA1, 0, A4, PA2, 0, A6, PA3, 0, \ | |||
| A8, PA4, 0, A10, PA5, 0, A12, PA6, 0, A14, PA7, 0 | |||
| GMADD xvf, d, TP0, A0, X0, TP0, TP1, A2, X0, TP1, \ | |||
| TP2, A4, X0, TP2, TP3, A6, X0, TP3, \ | |||
| TP4, A8, X0, TP4, TP5, A10, X0, TP5, \ | |||
| TP6, A12, X0, TP6, TP7, A14, X0, TP7 | |||
| .endm | |||
| /* DGEMV_T_4x8: 4 columns x 8 rows. Two vectors through each of PA0..PA3, accumulated with X0/X1 into TP0..TP3. */ | |||
| .macro DGEMV_T_4x8 | |||
| GLD_INC xv, , 0x20, \ | |||
| A0, PA0, 0, A1, PA0, 0, \ | |||
| A2, PA1, 0, A3, PA1, 0, \ | |||
| A4, PA2, 0, A5, PA2, 0, \ | |||
| A6, PA3, 0, A7, PA3, 0 | |||
| GMADD xvf, d, TP0, A0, X0, TP0, TP0, A1, X1, TP0, \ | |||
| TP1, A2, X0, TP1, TP1, A3, X1, TP1, \ | |||
| TP2, A4, X0, TP2, TP2, A5, X1, TP2, \ | |||
| TP3, A6, X0, TP3, TP3, A7, X1, TP3 | |||
| .endm | |||
| /* DGEMV_T_4x4: 4 columns x 4 rows. One vector through each of PA0..PA3, accumulated with X0 into TP0..TP3. */ | |||
| .macro DGEMV_T_4x4 | |||
| GLD_INC xv, , 0x20, A0, PA0, 0, A2, PA1, 0, A4, PA2, 0, A6, PA3, 0 | |||
| GMADD xvf, d, TP0, A0, X0, TP0, TP1, A2, X0, TP1, \ | |||
| TP2, A4, X0, TP2, TP3, A6, X0, TP3 | |||
| .endm | |||
| /* DGEMV_T_2x8: 2 columns x 8 rows. Two vectors through each of PA0, PA1, accumulated with X0/X1 into TP0, TP1. */ | |||
| .macro DGEMV_T_2x8 | |||
| GLD_INC xv, , 0x20, A0, PA0, 0, A1, PA0, 0, A2, PA1, 0, A3, PA1, 0 | |||
| GMADD xvf, d, TP0, A0, X0, TP0, TP0, A1, X1, TP0, \ | |||
| TP1, A2, X0, TP1, TP1, A3, X1, TP1 | |||
| .endm | |||
| /* DGEMV_T_2x4: 2 columns x 4 rows. One vector through each of PA0, PA1, accumulated with X0 into TP0, TP1. */ | |||
| .macro DGEMV_T_2x4 | |||
| GLD_INC xv, , 0x20, A0, PA0, 0, A2, PA1, 0 | |||
| GMADD xvf, d, TP0, A0, X0, TP0, TP1, A2, X0, TP1 | |||
| .endm | |||
| /* DGEMV_T: loop nest of the transposed kernel, instantiated once per x-stride variant by the dispatch code. */ | |||
| /*   XW     - tag spliced into every local label so that each instantiation gets its own labels. */ | |||
| /*   X8, X4 - suffixes appended to DLOAD_ to select the x loader for 8 / 4 rows. */ | |||
| /* Columns are consumed 8 at a time, then by the 4, 2 and 1 bits of N. For each column group x is re-read from */ | |||
| /* X_ORG, rows are accumulated 8 at a time, then 4, into the TP vectors, the vectors are reduced into Y0.., the */ | |||
| /* remaining M & 3 rows are added by a scalar loop, and finally y[j] = alpha * sum + y[j] is stored. */ | |||
| /* NOTE(review): GACC is defined outside this file; presumably it sums the lanes of each TPi into Yi - confirm. */ | |||
| .macro DGEMV_T XW:req X8:req, X4:req | |||
| PTR_SRLI J, N, 3 | |||
| beqz J, .L_\XW\()_N_7 | |||
| /* K_LDA = 8*LDA - M*8: the row loops advance each PAi by M*8 bytes, so adding K_LDA moves it 8 columns on. */ | |||
| PTR_SLLI K_LDA, LDA, 3 | |||
| PTR_SUB K_LDA, K_LDA, M8 | |||
| .L_\XW\()_N_L8: | |||
| ZERO_Y8 | |||
| move X, X_ORG | |||
| PTR_SRLI I, M, 3 | |||
| beqz I, .L_\XW\()_M_7 | |||
| .align 5 | |||
| .L_\XW\()_M_L8: | |||
| DLOAD_\X8 | |||
| DGEMV_T_8x8 | |||
| PTR_ADDI I, I, -1 | |||
| PTR_ALSL X, INC_X, X, 3 | |||
| bnez I, .L_\XW\()_M_L8 | |||
| .L_\XW\()_M_7: | |||
| andi I, M, 4 | |||
| beqz I, .L_\XW\()_M_3 | |||
| DLOAD_\X4 | |||
| DGEMV_T_8x4 | |||
| PTR_ALSL X, INC_X, X, 2 | |||
| .L_\XW\()_M_3: | |||
| // Accumulated | |||
| GACC xvf, d, Y0, TP0, Y1, TP1, Y2, TP2, Y3, TP3, Y4, TP4, \ | |||
| Y5, TP5, Y6, TP6, Y7, TP7 | |||
| andi I, M, 3 | |||
| beqz I, .L_\XW\()_M_END | |||
| .align 5 | |||
| /* Scalar tail over rows: $f3..$f10 carry the same register numbers as Y0..Y7. */ | |||
| .L_\XW\()_M_L1: | |||
| fld.d $f1, X, 0x00 | |||
| fld.d $f11, PA0, 0x00 | |||
| fld.d $f12, PA1, 0x00 | |||
| fld.d $f13, PA2, 0x00 | |||
| fld.d $f14, PA3, 0x00 | |||
| fld.d $f15, PA4, 0x00 | |||
| fld.d $f16, PA5, 0x00 | |||
| fld.d $f17, PA6, 0x00 | |||
| fld.d $f18, PA7, 0x00 | |||
| #if __loongarch_grlen == 64 | |||
| GADDI , d, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08, \ | |||
| PA4, PA4, 0x08, PA5, PA5, 0x08, PA6, PA6, 0x08, PA7, PA7, 0x08 | |||
| #else | |||
| GADDI , w, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08, \ | |||
| PA4, PA4, 0x08, PA5, PA5, 0x08, PA6, PA6, 0x08, PA7, PA7, 0x08 | |||
| #endif | |||
| GMADD f, d, $f3, $f11, $f1, $f3, $f4, $f12, $f1, $f4, $f5, $f13, $f1, $f5, $f6, $f14, $f1, $f6, \ | |||
| $f7, $f15, $f1, $f7, $f8, $f16, $f1, $f8, $f9, $f17, $f1, $f9, $f10, $f18, $f1, $f10 | |||
| PTR_ADDI I, I, -1 | |||
| PTR_ADD X, X, INC_X | |||
| bnez I, .L_\XW\()_M_L1 | |||
| /* Load the 8 current y values; PY0/PY1/PY2 point at y[2], y[4], y[6] of this group. */ | |||
| .L_\XW\()_M_END: | |||
| fld.d $f11, Y, 0x00 | |||
| fldx.d $f12, Y, INC_Y | |||
| PTR_ALSL PY0, INC_Y, Y, 1 | |||
| fld.d $f13, PY0, 0x00 | |||
| fldx.d $f14, PY0, INC_Y | |||
| PTR_ALSL PY1, INC_Y, Y, 2 | |||
| fld.d $f15, PY1, 0x00 | |||
| fldx.d $f16, PY1, INC_Y | |||
| PTR_ALSL PY2, INC_Y, PY1, 1 | |||
| fld.d $f17, PY2, 0x00 | |||
| fldx.d $f18, PY2, INC_Y | |||
| /* y[j] = alpha * sum[j] + y[j] */ | |||
| GMADD f, d, $f11, ALPHA, $f3, $f11, $f12, ALPHA, $f4, $f12, $f13, ALPHA, $f5, $f13, $f14, ALPHA, $f6, $f14, \ | |||
| $f15, ALPHA, $f7, $f15, $f16, ALPHA, $f8, $f16, $f17, ALPHA, $f9, $f17, $f18, ALPHA, $f10, $f18 | |||
| PTR_ADDI J, J, -1 | |||
| /* Move all eight column pointers to the next group of 8 columns. */ | |||
| #if __loongarch_grlen == 64 | |||
| GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ | |||
| PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA | |||
| #else | |||
| GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ | |||
| PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA | |||
| #endif | |||
| fst.d $f11, Y, 0x00 | |||
| fstx.d $f12, Y, INC_Y | |||
| fst.d $f13, PY0, 0x00 | |||
| fstx.d $f14, PY0, INC_Y | |||
| fst.d $f15, PY1, 0x00 | |||
| fstx.d $f16, PY1, INC_Y | |||
| fst.d $f17, PY2, 0x00 | |||
| fstx.d $f18, PY2, INC_Y | |||
| PTR_ALSL Y, INC_Y, Y, 3 | |||
| bnez J, .L_\XW\()_N_L8 | |||
| /* Remaining columns: 4-column group if bit 2 of N is set. */ | |||
| .L_\XW\()_N_7: | |||
| andi J, N, 4 | |||
| beqz J, .L_\XW\()_N_3 | |||
| ZERO_Y4 | |||
| move X, X_ORG | |||
| PTR_SRLI I, M, 3 | |||
| beqz I, .L_\XW\()_N_4_M_7 | |||
| .align 5 | |||
| .L_\XW\()_N_4_M_L8: | |||
| DLOAD_\X8 | |||
| DGEMV_T_4x8 | |||
| PTR_ADDI I, I, -1 | |||
| PTR_ALSL X, INC_X, X, 3 | |||
| bnez I, .L_\XW\()_N_4_M_L8 | |||
| .L_\XW\()_N_4_M_7: | |||
| andi I, M, 4 | |||
| beqz I, .L_\XW\()_N_4_M_3 | |||
| DLOAD_\X4 | |||
| DGEMV_T_4x4 | |||
| PTR_ALSL X, INC_X, X, 2 | |||
| .L_\XW\()_N_4_M_3: | |||
| // Accumulated | |||
| GACC xvf, d, Y0, TP0, Y1, TP1, Y2, TP2, Y3, TP3 | |||
| andi I, M, 3 | |||
| beqz I, .L_\XW\()_N_4_M_END | |||
| .align 5 | |||
| .L_\XW\()_N_4_M_L1: | |||
| fld.d $f1, X, 0x00 | |||
| GLD_INC f, d, 0x08, $f11, PA0, 0x00, $f12, PA1, 0x00, $f13, PA2, 0x00, $f14, PA3, 0x00 | |||
| GMADD f, d, $f3, $f11, $f1, $f3, $f4, $f12, $f1, $f4, $f5, $f13, $f1, $f5, $f6, $f14, $f1, $f6 | |||
| PTR_ADDI I, I, -1 | |||
| PTR_ADD X, X, INC_X | |||
| bnez I, .L_\XW\()_N_4_M_L1 | |||
| .L_\XW\()_N_4_M_END: | |||
| fld.d $f11, Y, 0x00 | |||
| fldx.d $f12, Y, INC_Y | |||
| PTR_ALSL PY0, INC_Y, Y, 1 | |||
| fld.d $f13, PY0, 0x00 | |||
| fldx.d $f14, PY0, INC_Y | |||
| GMADD f, d, $f11, ALPHA, $f3, $f11, $f12, ALPHA, $f4, $f12, $f13, ALPHA, $f5, $f13, $f14, ALPHA, $f6, $f14 | |||
| /* K_LDA = 4*LDA - M*8: advance PA0..PA3 by 4 columns. */ | |||
| PTR_SLLI K_LDA, LDA, 2 | |||
| PTR_SUB K_LDA, K_LDA, M8 | |||
| #if __loongarch_grlen == 64 | |||
| GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA | |||
| #else | |||
| GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA | |||
| #endif | |||
| fst.d $f11, Y, 0x00 | |||
| fstx.d $f12, Y, INC_Y | |||
| fst.d $f13, PY0, 0x00 | |||
| fstx.d $f14, PY0, INC_Y | |||
| PTR_ALSL Y, INC_Y, Y, 2 | |||
| /* 2-column group if bit 1 of N is set. */ | |||
| .L_\XW\()_N_3: | |||
| andi J, N, 2 | |||
| beqz J, .L_\XW\()_N_1 | |||
| ZERO_Y2 | |||
| move X, X_ORG | |||
| PTR_SRLI I, M, 3 | |||
| beqz I, .L_\XW\()_N_2_M_7 | |||
| .align 5 | |||
| .L_\XW\()_N_2_M_L8: | |||
| DLOAD_\X8 | |||
| DGEMV_T_2x8 | |||
| PTR_ADDI I, I, -1 | |||
| PTR_ALSL X, INC_X, X, 3 | |||
| bnez I, .L_\XW\()_N_2_M_L8 | |||
| .L_\XW\()_N_2_M_7: | |||
| andi I, M, 4 | |||
| beqz I, .L_\XW\()_N_2_M_3 | |||
| DLOAD_\X4 | |||
| DGEMV_T_2x4 | |||
| PTR_ALSL X, INC_X, X, 2 | |||
| .L_\XW\()_N_2_M_3: | |||
| // Accumulated | |||
| GACC xvf, d, Y0, TP0, Y1, TP1 | |||
| andi I, M, 3 | |||
| beqz I, .L_\XW\()_N_2_M_END | |||
| .align 5 | |||
| .L_\XW\()_N_2_M_L1: | |||
| fld.d $f1, X, 0x00 | |||
| GLD_INC f, d, 0x08, $f11, PA0, 0x00, $f12, PA1, 0x00 | |||
| GMADD f, d, $f3, $f11, $f1, $f3, $f4, $f12, $f1, $f4 | |||
| PTR_ADDI I, I, -1 | |||
| PTR_ADD X, X, INC_X | |||
| bnez I, .L_\XW\()_N_2_M_L1 | |||
| .L_\XW\()_N_2_M_END: | |||
| fld.d $f11, Y, 0x00 | |||
| fldx.d $f12, Y, INC_Y | |||
| GMADD f, d, $f11, ALPHA, $f3, $f11, $f12, ALPHA, $f4, $f12 | |||
| /* K_LDA = 2*LDA - M*8: advance PA0 and PA1 by 2 columns. */ | |||
| PTR_SLLI K_LDA, LDA, 1 | |||
| PTR_SUB K_LDA, K_LDA, M8 | |||
| #if __loongarch_grlen == 64 | |||
| GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA | |||
| #else | |||
| GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA | |||
| #endif | |||
| fst.d $f11, Y, 0x00 | |||
| fstx.d $f12, Y, INC_Y | |||
| PTR_ALSL Y, INC_Y, Y, 1 | |||
| /* Last single column if bit 0 of N is set: a purely scalar dot product accumulated in $f19 (TP0's register). */ | |||
| .L_\XW\()_N_1: | |||
| andi J, N, 1 | |||
| beqz J, .L_END | |||
| ZERO_Y1 | |||
| move X, X_ORG | |||
| move I, M | |||
| beqz I, .L_END | |||
| .align 5 | |||
| .L_\XW\()_N_1_M_L1: | |||
| fld.d $f3, PA0, 0x00 | |||
| fld.d $f1, X, 0x00 | |||
| fmadd.d $f19, $f3, $f1, $f19 | |||
| PTR_ADDI I, I, -1 | |||
| PTR_ADD X, X, INC_X | |||
| PTR_ADDI PA0, PA0, 0x08 | |||
| bnez I, .L_\XW\()_N_1_M_L1 | |||
| /* y = alpha * sum + y for the last column. */ | |||
| fld.d $f3, Y, 0x00 | |||
| fmadd.d $f3, ALPHA, $f19, $f3 | |||
| fst.d $f3, Y, 0x00 | |||
| b .L_END | |||
| .endm | |||
| /* Kernel entry: set up strides and column pointers, then jump to the DGEMV_T instantiation matching inc_x. */ | |||
| PROLOGUE | |||
| /* inc_y is read from the top of the stack before push_if_used runs. */ | |||
| PTR_LD INC_Y, $sp, 0 | |||
| /* NOTE(review): push_if_used/pop_if_used are defined elsewhere; presumably they save/restore callee-saved registers. */ | |||
| push_if_used 17 + 8, 24 + 3 | |||
| PTR_ADDI K, $r0, 0x01 | |||
| PTR_SUB I, INC_X, K | |||
| maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */ | |||
| /* Convert LDA, INC_X and INC_Y to byte strides (x8) and set M8 = M*8. */ | |||
| GSLLI , d, LDA, LDA, 3, INC_X, INC_X, 3, INC_Y, INC_Y, 3, M8, M, 3 | |||
| /* VALPHA is $xr0 itself, so this replicates alpha across the register in place. */ | |||
| xvreplve0.d VALPHA, $xr0 | |||
| move X_ORG, X | |||
| move PA0, A | |||
| /* PA1..PA7 = PA0 + 1..7 columns. */ | |||
| #if __loongarch_grlen == 64 | |||
| GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ | |||
| PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA | |||
| #else | |||
| GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ | |||
| PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA | |||
| #endif | |||
| /* Table dispatch: each entry is a 16-bit offset relative to .L_GAP_TABLE, hence index*2 for the entry address. */ | |||
| la.local T0, .L_GAP_TABLE | |||
| PTR_ALSL I, I, T0, 1 | |||
| ld.h K, I, 0 | |||
| PTR_ADD T0, T0, K | |||
| jirl $r0, T0, 0 | |||
| .L_GAP_TABLE: | |||
| .hword .L_GAP_0 - .L_GAP_TABLE | |||
| .hword .L_GAP_1 - .L_GAP_TABLE | |||
| .L_GAP_0: /* if (incx == 1) */ | |||
| DGEMV_T GAP_0, X8, X4 | |||
| .L_GAP_1: /* if (incx != 1) */ | |||
| DGEMV_T GAP_1, X8_GAP, X4_GAP | |||
| /* Common exit reached from both DGEMV_T instantiations. */ | |||
| .L_END: | |||
| pop_if_used 17 + 8, 24 + 3 | |||
| jirl $r0, $r1, 0x0 | |||
| EPILOGUE | |||
| @@ -70,7 +70,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| MTC s1, $r0 | |||
| bge $r0, N, .L999 | |||
| slli.d INCX, INCX, BASE_SHIFT | |||
| bge $r0, INCX, .L999 | |||
| beq $r0, INCX, .L999 | |||
| move XX, X | |||
| NOP | |||
| LD a1, X, 0 * SIZE | |||
| @@ -0,0 +1,407 @@ | |||
| /******************************************************************************* | |||
| Copyright (c) 2023, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *******************************************************************************/ | |||
| // Integer/pointer instruction aliases keyed on the general-register width | |||
| // (__loongarch_grlen): 64-bit builds use the .d forms, anything else the .w | |||
| // forms. REG_SIZE is the register size in bytes and REG_LOG its log2. | |||
| #if __loongarch_grlen == 64 | |||
| #define LA_REG int64_t | |||
| #define REG_SIZE 8 | |||
| #define REG_LOG 3 | |||
| #define PTR_ADDI addi.d | |||
| #define PTR_ADD add.d | |||
| #define PTR_SUB sub.d | |||
| #define PTR_LD ld.d | |||
| #define PTR_ST st.d | |||
| #define PTR_SLLI slli.d | |||
| #define PTR_SRLI srli.d | |||
| #define PTR_SRAI srai.d | |||
| #define PTR_MUL mul.d | |||
| #define PTR_ALSL alsl.d | |||
| #else | |||
| #define LA_REG int32_t | |||
| #define REG_SIZE 4 | |||
| #define REG_LOG 2 | |||
| #define PTR_ADDI addi.w | |||
| #define PTR_ADD add.w | |||
| #define PTR_SUB sub.w | |||
| #define PTR_LD ld.w | |||
| #define PTR_ST st.w | |||
| #define PTR_SLLI slli.w | |||
| #define PTR_SRLI srli.w | |||
| #define PTR_SRAI srai.w | |||
| #define PTR_MUL mul.w | |||
| #define PTR_ALSL alsl.w | |||
| #endif | |||
| // Scalar floating-point load/store aliases keyed on the FP register width | |||
| // (__loongarch_frlen). FREG_SIZE is in bytes and FREG_LOG is its log2. | |||
| #if __loongarch_frlen == 64 | |||
| #define FREG_SIZE 8 | |||
| #define FREG_LOG 3 | |||
| #define PTR_FLD fld.d | |||
| #define PTR_FST fst.d | |||
| #else | |||
| #define FREG_SIZE 4 | |||
| #define FREG_LOG 2 | |||
| #define PTR_FLD fld.s | |||
| #define PTR_FST fst.s | |||
| #endif | |||
| // The max registers available to the user which | |||
| // do not need to be preserved across calls. | |||
| // Ref: https://loongson.github.io/LoongArch-Documentation/LoongArch-ELF-ABI-CN.html | |||
| #define MAX_INT_CALLER_SAVED 17 | |||
| #define MAX_FP_CALLER_SAVED 24 | |||
| .altmacro // Enable alternate macro mode | |||
| // push_if_used regs, fregs | |||
| // Prologue helper. regs and fregs are compared against MAX_INT_CALLER_SAVED | |||
| // and MAX_FP_CALLER_SAVED; only the excess over those limits is spilled. | |||
| // For each bank with an excess, $sp is lowered by (excess << REG_LOG) or | |||
| // (excess << FREG_LOG) bytes and $s0..$s(excess-1), respectively | |||
| // $fs0..$fs(excess-1), are stored into the new area. | |||
| // The integer area is allocated first and the FP area below it, so the | |||
| // matching pop_if_used has to receive the same two arguments. | |||
| .macro push_if_used regs, fregs | |||
| .if \regs > MAX_INT_CALLER_SAVED | |||
| PTR_ADDI $sp, $sp, -((\regs - MAX_INT_CALLER_SAVED) << REG_LOG) | |||
| push_regs 0, \regs - MAX_INT_CALLER_SAVED - 1 | |||
| .endif | |||
| .if \fregs > MAX_FP_CALLER_SAVED | |||
| PTR_ADDI $sp, $sp, -((\fregs - MAX_FP_CALLER_SAVED) << FREG_LOG) | |||
| push_fregs 0, \fregs - MAX_FP_CALLER_SAVED - 1 | |||
| .endif | |||
| .endm // End push_if_used | |||
| // pop_if_used regs, fregs | |||
| // Epilogue helper that undoes push_if_used in reverse order: the FP | |||
| // registers are reloaded and their area released first, then the integer | |||
| // registers. Nothing is emitted for a bank whose count does not exceed | |||
| // MAX_FP_CALLER_SAVED / MAX_INT_CALLER_SAVED. | |||
| .macro pop_if_used regs, fregs | |||
| .if \fregs > MAX_FP_CALLER_SAVED | |||
| pop_fregs 0, \fregs - MAX_FP_CALLER_SAVED - 1 | |||
| PTR_ADDI $sp, $sp, (\fregs - MAX_FP_CALLER_SAVED) << FREG_LOG | |||
| .endif | |||
| .if \regs > MAX_INT_CALLER_SAVED | |||
| pop_regs 0, \regs - MAX_INT_CALLER_SAVED - 1 | |||
| PTR_ADDI $sp, $sp, (\regs - MAX_INT_CALLER_SAVED) << REG_LOG | |||
| .endif | |||
| .endm // End pop_if_used | |||
| // push_regs from, to | |||
| // Stores $s<from> at $sp + (from << REG_LOG), then expands itself again | |||
| // with from + 1 until from equals to. The % prefix makes the assembler | |||
| // evaluate the expression, which requires the alternate macro mode enabled | |||
| // earlier in this file. | |||
| .macro push_regs from, to | |||
| PTR_ST $s\()\from, $sp, \from << REG_LOG | |||
| .if \to - \from | |||
| push_regs %from + 1, \to | |||
| .endif | |||
| .endm // End push_regs | |||
| // pop_regs from, to | |||
| // Reloads $s<from> from $sp + (from << REG_LOG), then expands itself again | |||
| // with from + 1 until from equals to. Uses the same slot layout as | |||
| // push_regs. | |||
| .macro pop_regs from, to | |||
| PTR_LD $s\()\from, $sp, \from << REG_LOG | |||
| .if \to - \from | |||
| pop_regs %from + 1, \to | |||
| .endif | |||
| .endm // End pop_regs | |||
| // push_fregs from, to | |||
| // Stores $fs<from> at $sp + (from << FREG_LOG) with the scalar FP store | |||
| // selected by PTR_FST, then expands itself again with from + 1 until from | |||
| // equals to. | |||
| .macro push_fregs from, to | |||
| PTR_FST $fs\()\from, $sp, \from << FREG_LOG | |||
| .if \to - \from | |||
| push_fregs %from + 1, \to | |||
| .endif | |||
| .endm // End push_fregs | |||
| // pop_fregs from, to | |||
| // Reloads $fs<from> from $sp + (from << FREG_LOG) with the scalar FP load | |||
| // selected by PTR_FLD, then expands itself again with from + 1 until from | |||
| // equals to. | |||
| .macro pop_fregs from, to | |||
| PTR_FLD $fs\()\from, $sp, \from << FREG_LOG | |||
| .if \to - \from | |||
| pop_fregs %from + 1, \to | |||
| .endif | |||
| .endm // End pop_fregs | |||
| // | |||
| // Instruction Related Macros | |||
| // | |||
| // GLD | |||
| // | |||
| // Emits one load per (out, src, offset) triple, where offset is an | |||
| // immediate. The mnemonic is <pre_op>ld when suf_op is 0 (its default, also | |||
| // taken when the argument is left empty) and <pre_op>ld.<suf_op> otherwise. | |||
| // Any further triples are handled by re-expanding the macro. | |||
| .macro GLD pre_op:req, suf_op=0, out:req, src:req, offset:req/* imm */, more:vararg | |||
| .ifeqs "\suf_op", "0" | |||
| \pre_op\()ld \out, \src, \offset | |||
| .else | |||
| \pre_op\()ld.\suf_op \out, \src, \offset | |||
| .endif | |||
| .ifnb \more | |||
| GLD \pre_op, \suf_op, \more | |||
| .endif | |||
| .endm | |||
| // | |||
| // GLD_INC | |||
| // | |||
| // Same load as GLD, followed by PTR_ADDI src, src, inc after every load, | |||
| // so src is advanced by the immediate inc once per (out, src, offset) | |||
| // triple. | |||
| .macro GLD_INC pre_op:req, suf_op=0, inc:req, out:req, src:req, offset:req/* imm */, more:vararg | |||
| .ifeqs "\suf_op", "0" | |||
| \pre_op\()ld \out, \src, \offset | |||
| .else | |||
| \pre_op\()ld.\suf_op \out, \src, \offset | |||
| .endif | |||
| PTR_ADDI \src, \src, \inc | |||
| .ifnb \more | |||
| GLD_INC \pre_op, \suf_op, \inc, \more | |||
| .endif | |||
| .endm | |||
| // | |||
| // GLDX is same as GLD except the stride is a register | |||
| // | |||
| // Emits <pre_op>ldx or <pre_op>ldx.<suf_op> per (out, src, offset) triple; | |||
| // offset is a register here. suf_op of 0 (the default) drops the suffix. | |||
| .macro GLDX pre_op:req, suf_op=0, out:req, src:req, offset:req/* reg */, more:vararg | |||
| .ifeqs "\suf_op", "0" | |||
| \pre_op\()ldx \out, \src, \offset | |||
| .else | |||
| \pre_op\()ldx.\suf_op \out, \src, \offset | |||
| .endif | |||
| .ifnb \more | |||
| GLDX \pre_op, \suf_op, \more | |||
| .endif | |||
| .endm | |||
| // | |||
| // GLDREPL | |||
| // | |||
| // Emits <pre_op>ldrepl.<suf_op> out, src, offset per triple; offset is an | |||
| // immediate and suf_op is mandatory. | |||
| .macro GLDREPL pre_op:req, suf_op:req, out:req, src:req, offset:req/* imm */, more:vararg | |||
| \pre_op\()ldrepl.\suf_op \out, \src, \offset | |||
| .ifnb \more | |||
| GLDREPL \pre_op, \suf_op, \more | |||
| .endif | |||
| .endm | |||
| // | |||
| // GST | |||
| // | |||
| // Emits one store per (src, dst, offset) triple, where offset is an | |||
| // immediate. The mnemonic is <pre_op>st when suf_op is 0 (its default, also | |||
| // taken when the argument is left empty) and <pre_op>st.<suf_op> otherwise. | |||
| .macro GST pre_op:req, suf_op=0, src:req, dst:req, offset:req/* imm */, more:vararg | |||
| .ifeqs "\suf_op", "0" | |||
| \pre_op\()st \src, \dst, \offset | |||
| .else | |||
| \pre_op\()st.\suf_op \src, \dst, \offset | |||
| .endif | |||
| .ifnb \more | |||
| GST \pre_op, \suf_op, \more | |||
| .endif | |||
| .endm | |||
| // | |||
| // GMUL | |||
| // | |||
| // Emits <pre_op>mul.<suf_op> out, in0, in1 per (out, in0, in1) triple. | |||
| // pre_op is optional and may be left empty. | |||
| .macro GMUL pre_op, suf_op:req, out:req, in0:req, in1:req, more:vararg | |||
| \pre_op\()mul.\suf_op \out, \in0, \in1 | |||
| .ifnb \more | |||
| GMUL \pre_op, \suf_op, \more | |||
| .endif | |||
| .endm | |||
| // | |||
| // GMADD | |||
| // | |||
| // Emits <pre_op>madd.<suf_op> out, in0, in1, in2 per group of four | |||
| // operands. pre_op is optional and may be left empty. | |||
| .macro GMADD pre_op, suf_op:req, out:req, in0:req, in1:req, in2:req, more:vararg | |||
| \pre_op\()madd.\suf_op \out, \in0, \in1, \in2 | |||
| .ifnb \more | |||
| GMADD \pre_op, \suf_op, \more | |||
| .endif | |||
| .endm | |||
| // | |||
| // GADD | |||
| // | |||
| // Emits <pre_op>add.<suf_op> out, in0, in1 per (out, in0, in1) triple. | |||
| // pre_op is optional; with an empty pre_op this expands to the plain | |||
| // add.<suf_op> instruction. | |||
| .macro GADD pre_op, suf_op:req, out:req, in0:req, in1:req, more:vararg | |||
| \pre_op\()add.\suf_op \out, \in0, \in1 | |||
| .ifnb \more | |||
| GADD \pre_op, \suf_op, \more | |||
| .endif | |||
| .endm | |||
| // | |||
| // GADDI | |||
| // | |||
| // Emits <pre_op>addi.<suf_op> out, in0, in1 per (out, in0, in1) triple. | |||
| // pre_op is optional and may be left empty. | |||
| .macro GADDI pre_op, suf_op:req, out:req, in0:req, in1:req, more:vararg | |||
| \pre_op\()addi.\suf_op \out, \in0, \in1 | |||
| .ifnb \more | |||
| GADDI \pre_op, \suf_op, \more | |||
| .endif | |||
| .endm | |||
| // | |||
| // GSUB | |||
| // | |||
| // Emits <pre_op>sub.<suf_op> out, in0, in1 per (out, in0, in1) triple. | |||
| // pre_op is optional and may be left empty. | |||
| .macro GSUB pre_op, suf_op:req, out:req, in0:req, in1:req, more:vararg | |||
| \pre_op\()sub.\suf_op \out, \in0, \in1 | |||
| .ifnb \more | |||
| GSUB \pre_op, \suf_op, \more | |||
| .endif | |||
| .endm | |||
| // | |||
| // GSLLI | |||
| // | |||
| // Emits <pre_op>slli.<suf_op> out, in0, in1 per (out, in0, in1) triple. | |||
| // pre_op is optional and may be left empty. | |||
| .macro GSLLI pre_op, suf_op:req, out:req, in0:req, in1:req, more:vararg | |||
| \pre_op\()slli.\suf_op \out, \in0, \in1 | |||
| .ifnb \more | |||
| GSLLI \pre_op, \suf_op, \more | |||
| .endif | |||
| .endm | |||
| // | |||
| // GINSVE0 | |||
| // | |||
| // Emits <pre_op>insve0.<suf_op> out, in0, in1 per (out, in0, in1) triple. | |||
| .macro GINSVE0 pre_op:req, suf_op:req, out:req, in0:req, in1:req, more:vararg | |||
| \pre_op\()insve0.\suf_op \out, \in0, \in1 | |||
| .ifnb \more | |||
| GINSVE0 \pre_op, \suf_op, \more | |||
| .endif | |||
| .endm | |||
| // | |||
| // GXOR | |||
| // | |||
| // Emits <pre_op>xor.<suf_op> out, in0, in1 per (out, in0, in1) triple. | |||
| .macro GXOR pre_op:req, suf_op:req, out:req, in0:req, in1:req, more:vararg | |||
| \pre_op\()xor.\suf_op \out, \in0, \in1 | |||
| .ifnb \more | |||
| GXOR \pre_op, \suf_op, \more | |||
| .endif | |||
| .endm | |||
| // | |||
| // GPERMI | |||
| // | |||
| // Emits <pre_op>permi.<suf_op> out, in0, in1 per (out, in0, in1) triple. | |||
| // The callers in this file pass an immediate selector as in1. | |||
| .macro GPERMI pre_op:req, suf_op:req, out:req, in0:req, in1:req, more:vararg | |||
| \pre_op\()permi.\suf_op \out, \in0, \in1 | |||
| .ifnb \more | |||
| GPERMI \pre_op, \suf_op, \more | |||
| .endif | |||
| .endm | |||
| // | |||
| // GNMSUB | |||
| // | |||
| // Emits <pre_op>nmsub.<suf_op> out, in0, in1, in2 per group of four | |||
| // operands. | |||
| .macro GNMSUB pre_op:req, suf_op:req, out:req, in0:req, in1:req, in2:req, more:vararg | |||
| \pre_op\()nmsub.\suf_op \out, \in0, \in1, \in2 | |||
| .ifnb \more | |||
| GNMSUB \pre_op, \suf_op, \more | |||
| .endif | |||
| .endm | |||
| // | |||
| // GPRELD | |||
| // | |||
| // Emits preld in0, in1, in2 per group of three operands; the operands are | |||
| // passed through to the instruction unchanged. | |||
| .macro GPRELD in0:req, in1:req, in2:req, more:vararg | |||
| preld \in0, \in1, \in2 | |||
| .ifnb \more | |||
| GPRELD \more | |||
| .endif | |||
| .endm | |||
| // | |||
| // Compound instructions | |||
| // | |||
| // GACC: Accumulate the values of vector registers | |||
| // | |||
| // Folds the elements of vector register in together with repeated | |||
| // pack-odd + add steps and leaves the result in out, once per (out, in) | |||
| // pair. in is used as scratch and is overwritten. | |||
| // Only pre_op values xvf, vf, xv and v are recognised; any other value | |||
| // emits nothing for that pair. The number of folding steps depends on | |||
| // suf_op (d needs the fewest, each narrower element adds one step). | |||
| // NOTE(review): the total appears to land in element 0 of out - confirm | |||
| // against the instruction reference before relying on other elements. | |||
| .macro GACC pre_op:req, suf_op:req, out:req, in:req, more:vararg | |||
| // Floating-point add on xv registers: fold the two halves, then d pairs, | |||
| // plus one extra w step for suf_op s. | |||
| .ifeqs "\pre_op", "xvf" | |||
| xvpermi.q \out, \in, 0x01 | |||
| \pre_op\()add.\suf_op \in, \out, \in | |||
| xvpackod.d \out, \in, \in | |||
| \pre_op\()add.\suf_op \out, \out, \in | |||
| .ifeqs "\suf_op", "s" | |||
| xvpackod.w \in, \out, \out | |||
| \pre_op\()add.\suf_op \out, \out, \in | |||
| .endif | |||
| .endif | |||
| // Floating-point add on v registers: no half-swap step is needed. | |||
| .ifeqs "\pre_op", "vf" | |||
| vpackod.d \out, \in, \in | |||
| \pre_op\()add.\suf_op \out, \out, \in | |||
| .ifeqs "\suf_op", "s" | |||
| vpackod.w \in, \out, \out | |||
| \pre_op\()add.\suf_op \out, \out, \in | |||
| .endif | |||
| .endif | |||
| // Integer add on xv registers: steps continue through w, h and b as long | |||
| // as suf_op is narrower than the step just performed. | |||
| .ifeqs "\pre_op", "xv" | |||
| xvpermi.q \out, \in, 0x01 | |||
| \pre_op\()add.\suf_op \in, \out, \in | |||
| xvpackod.d \out, \in, \in | |||
| \pre_op\()add.\suf_op \out, \out, \in | |||
| .ifnc "\suf_op", "d" | |||
| xvpackod.w \in, \out, \out | |||
| \pre_op\()add.\suf_op \out, \out, \in | |||
| .ifnc "\suf_op", "w" | |||
| xvpackod.h \in, \out, \out | |||
| \pre_op\()add.\suf_op \out, \out, \in | |||
| .ifnc "\suf_op", "h" | |||
| xvpackod.b \in, \out, \out | |||
| \pre_op\()add.\suf_op \out, \out, \in | |||
| .endif | |||
| .endif | |||
| .endif | |||
| .endif | |||
| // Integer add on v registers: same ladder without the half-swap step. | |||
| .ifeqs "\pre_op", "v" | |||
| vpackod.d \out, \in, \in | |||
| \pre_op\()add.\suf_op \out, \out, \in | |||
| .ifnc "\suf_op", "d" | |||
| vpackod.w \in, \out, \out | |||
| \pre_op\()add.\suf_op \out, \out, \in | |||
| .ifnc "\suf_op", "w" | |||
| vpackod.h \in, \out, \out | |||
| \pre_op\()add.\suf_op \out, \out, \in | |||
| .ifnc "\suf_op", "h" | |||
| vpackod.b \in, \out, \out | |||
| \pre_op\()add.\suf_op \out, \out, \in | |||
| .endif | |||
| .endif | |||
| .endif | |||
| .endif | |||
| .ifnb \more | |||
| GACC \pre_op, \suf_op, \more | |||
| .endif | |||
| .endm | |||
| // | |||
| // GMOV | |||
| // | |||
| // Copies in to out for each (out, in) pair by OR-ing in with itself | |||
| // (<pre_op>or.v out, in, in). | |||
| .macro GMOV pre_op:req, out:req, in:req, more:vararg | |||
| \pre_op\()or.v \out, \in, \in | |||
| .ifnb \more | |||
| GMOV \pre_op, \more | |||
| .endif | |||
| .endm | |||
| // | |||
| // Media Related Macros | |||
| // | |||
| // out0 = <pre_op>ilvl.<suf_op>(in0, in1), out1 = <pre_op>ilvh.<suf_op>(in0, in1). | |||
| // out0 is written before the second instruction reads in0 and in1, so out0 | |||
| // must not be the same register as either input. | |||
| .macro GSBUTTERFLY pre_op, suf_op, out0, out1, in0, in1 | |||
| \pre_op\()ilvl.\suf_op \out0, \in0, \in1 | |||
| \pre_op\()ilvh.\suf_op \out1, \in0, \in1 | |||
| .endm | |||
| // out0 = <pre_op>pickev.<suf_op>(in0, in1), out1 = <pre_op>pickod.<suf_op>(in0, in1). | |||
| // out0 is written before the second instruction reads in0 and in1, so out0 | |||
| // must not be the same register as either input. | |||
| .macro GINTERLACE pre_op, suf_op, out0, out1, in0, in1 | |||
| \pre_op\()pickev.\suf_op \out0, \in0, \in1 | |||
| \pre_op\()pickod.\suf_op \out1, \in0, \in1 | |||
| .endm | |||
| // | |||
| // GTRANSPOSE4x4_D: Transpose 4x4 block with double-word elements in vectors, | |||
| // has no pre_op param. 128-bit vector instructions are not supported. | |||
| // | |||
| // Operand order: the four inputs come first, then the four outputs, then | |||
| // two scratch vectors vt0 and vt1. The sequence is fixed to xv-prefixed | |||
| // instructions: two d-element butterflies, a set of register copies and | |||
| // four xvpermi.q steps that build out0..out3. vt0 and vt1 are overwritten. | |||
| .macro GTRANSPOSE4x4_D in0, in1, in2, in3, out0, out1, out2, out3, \ | |||
| vt0, vt1 | |||
| GSBUTTERFLY xv, d, \vt0, \out1, \in1, \in0 | |||
| GSBUTTERFLY xv, d, \vt1, \out3, \in3, \in2 | |||
| GMOV xv, \out0, \vt0, \out2, \vt1, \vt1, \out3 | |||
| GPERMI xv, q, \out0, \out2, 0x02, \out2, \vt0, 0x31, \out3, \out1, 0x31, \out1, \vt1, 0x02 | |||
| .endm | |||
| // Transposes an 8x8 block of w elements held in eight xv registers. | |||
| // Operand order: the eight outputs come first, then the eight inputs, then | |||
| // four scratch vectors (note that GTRANSPOSE4x4_D takes inputs first). | |||
| // The macro itself only reads in0..in7; tmp0..tmp3 and all outputs are | |||
| // overwritten, so neither group may overlap inputs that are still needed. | |||
| .macro GTRANSPOSE8x8_W out0, out1, out2, out3, out4, out5, out6, out7, \ | |||
| in0, in1, in2, in3, in4, in5, in6, in7, \ | |||
| tmp0, tmp1, tmp2, tmp3 | |||
| // Interleave in0..in3 into out0..out3. | |||
| GSBUTTERFLY xv, w, \tmp0, \tmp2, \in2, \in0 | |||
| GSBUTTERFLY xv, w, \tmp1, \tmp3, \in3, \in1 | |||
| GSBUTTERFLY xv, w, \out0, \out1, \tmp1, \tmp0 | |||
| GSBUTTERFLY xv, w, \out2, \out3, \tmp3, \tmp2 | |||
| // Interleave in4..in7 into out4..out7. | |||
| GSBUTTERFLY xv, w, \tmp0, \tmp2, \in6, \in4 | |||
| GSBUTTERFLY xv, w, \tmp1, \tmp3, \in7, \in5 | |||
| GSBUTTERFLY xv, w, \out4, \out5, \tmp1, \tmp0 | |||
| GSBUTTERFLY xv, w, \out6, \out7, \tmp3, \tmp2 | |||
| // Keep copies of out0..out3 because the first four permutes below | |||
| // overwrite them while the last four still need the old values. | |||
| GMOV xv, \tmp0, \out0, \tmp1, \out1, \tmp2, \out2, \tmp3, \out3 | |||
| GPERMI xv, q, \out0, \out4, 0x02, \out1, \out5, 0x02, \ | |||
| \out2, \out6, 0x02, \out3, \out7, 0x02, \ | |||
| \out4, \tmp0, 0x31, \out5, \tmp1, 0x31, \ | |||
| \out6, \tmp2, 0x31, \out7, \tmp3, 0x31 | |||
| .endm | |||
| @@ -0,0 +1,463 @@ | |||
| /******************************************************************************* | |||
| Copyright (c) 2023, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *******************************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| #include "loongarch64_asm.S" | |||
| /********************************************************************* | |||
| * 2023/08/23 guxiwei | |||
| * UTEST : OK | |||
| * CTEST : OK | |||
| * TEST : OK | |||
| *********************************************************************/ | |||
| /* Function parameters */ | |||
| #define M $r4 // param 1: m | |||
| #define N $r5 // param 2: n | |||
| #define SRC $r6 // param 3: src | |||
| #define LDA $r7 // param 4: lda | |||
| #define DST $r8 // param 5: dst | |||
| // Loop counters | |||
| #define I $r9 | |||
| #define J $r10 | |||
| // Source pointers, one per source vector of the panel being copied | |||
| #define S1 $r12 | |||
| #define S2 $r13 | |||
| #define S3 $r14 | |||
| #define S4 $r15 | |||
| #define S5 $r16 | |||
| #define S6 $r17 | |||
| #define S7 $r18 | |||
| #define S8 $r19 | |||
| #define S9 $r20 | |||
| #define S10 $r23 | |||
| #define S11 $r24 | |||
| #define S12 $r25 | |||
| #define S13 $r26 | |||
| #define S14 $r27 | |||
| #define S15 $r28 | |||
| #define S16 $r29 | |||
| // TD: running destination pointer, TS: start of the next source panel | |||
| #define TD $r30 | |||
| #define TS $r31 | |||
| // TL and T0 share registers with LDA ($r7) and SRC ($r6). The kernel | |||
| // copies SRC into TS first and then overwrites both with byte strides. | |||
| #define TL $r7 | |||
| #define T0 $r6 | |||
| #undef ZERO | |||
| #define ZERO $r0 | |||
| #define F0 $f0 | |||
| #define F1 $f1 | |||
| #define F2 $f2 | |||
| #define F3 $f3 | |||
| #define F4 $f4 | |||
| #define F5 $f5 | |||
| #define F6 $f6 | |||
| #define F7 $f7 | |||
| /* LASX vectors */ | |||
| #define U0 $xr0 | |||
| #define U1 $xr1 | |||
| #define U2 $xr2 | |||
| #define U3 $xr3 | |||
| #define U4 $xr4 | |||
| #define U5 $xr5 | |||
| #define U6 $xr6 | |||
| #define U7 $xr7 | |||
| #define U8 $xr8 | |||
| #define U9 $xr9 | |||
| #define U10 $xr10 | |||
| #define U11 $xr11 | |||
| #define U12 $xr12 | |||
| #define U13 $xr13 | |||
| #define U14 $xr14 | |||
| #define U15 $xr15 | |||
| #define D0 $xr16 | |||
| #define D1 $xr17 | |||
| #define D2 $xr18 | |||
| #define D3 $xr19 | |||
| #define D4 $xr20 | |||
| #define D5 $xr21 | |||
| #define D6 $xr22 | |||
| #define D7 $xr23 | |||
| #define D8 $xr24 | |||
| #define D9 $xr25 | |||
| #define D10 $xr26 | |||
| #define D11 $xr27 | |||
| #define D12 $xr28 | |||
| #define D13 $xr29 | |||
| #define D14 $xr30 | |||
| #define D15 $xr31 | |||
| // Loops outline | |||
| //.L_N16 <------------------- | |||
| //| .L_M8: | | |||
| //| .L_M7: | Main Loop | |||
| //| .L_M1: | | |||
| //| .L_M0: --------------- | |||
| //.L_N15: | |||
| //.L_N8: | |||
| //| .L_N8_M8: | |||
| //| .L_N8_M7: | |||
| //| .L_N8_M1: | |||
| //.L_N7: | |||
| //.L_N4: | |||
| //| .L_N4_M4: | |||
| //| .L_N4_M3: | |||
| //| .L_N4_M1: | |||
| //.L_N3: | |||
| //.L_N2: | |||
| //| .L_N2_M2: | |||
| //| .L_N2_M1: | |||
| //.L_N1: | |||
| //| .L_N1_M1: | |||
| //.L_N0 | |||
| // Pack kernel for 4-byte elements. Reads N source vectors of M contiguous | |||
| // elements each (vector k starts LDA elements after vector k - 1) and | |||
| // writes them to DST in panels of 16, 8, 4, 2 and 1 vectors. Inside a panel | |||
| // the elements sharing the same index are stored next to each other, index | |||
| // after index, and DST advances linearly. | |||
| // The SRC and LDA registers are recycled as T0 and TL, and M is consumed by | |||
| // the final single-vector loop. | |||
| PROLOGUE | |||
| // 26 integer / 32 FP registers are declared in use, which makes the helper | |||
| // spill $s0-$s8 and $fs0-$fs7. | |||
| push_if_used 26, 32 | |||
| move TD, DST | |||
| move TS, SRC | |||
| // TL = byte stride between source vectors, T0 = twice that stride. | |||
| PTR_SLLI TL, LDA, 0x02 | |||
| PTR_SLLI T0, TL, 0x01 | |||
| // J = number of full 16-vector panels. | |||
| PTR_SRAI J, N, 0x04 | |||
| beq J, ZERO, .L_N15 | |||
| .align 5 | |||
| .L_N16: | |||
| // Set up S1..S16 for the 16 vectors of this panel and move TS past them. | |||
| move S1, TS | |||
| PTR_ADD S2, TS, TL | |||
| PTR_SRAI I, M, 0x03 | |||
| PTR_ADD S3, S2, TL | |||
| PTR_ADDI J, J, -1 | |||
| PTR_ADD S4, S3, TL | |||
| PTR_ADD S5, S3, T0 | |||
| PTR_ADD S6, S4, T0 | |||
| PTR_ADD S7, S5, T0 | |||
| PTR_ADD S8, S6, T0 | |||
| PTR_ADD S9, S7, T0 | |||
| PTR_ADD S10, S8, T0 | |||
| PTR_ADD S11, S9, T0 | |||
| PTR_ADD S12, S10, T0 | |||
| PTR_ADD S13, S11, T0 | |||
| PTR_ADD S14, S12, T0 | |||
| PTR_ADD S15, S13, T0 | |||
| PTR_ADD S16, S14, T0 | |||
| PTR_ADD TS, S15, T0 | |||
| beq I, ZERO, .L_M7 | |||
| .align 5 | |||
| .L_M8: | |||
| // 8 elements from each of the 16 vectors per iteration. | |||
| xvld U0, S1, 0x00 | |||
| xvld U1, S2, 0x00 | |||
| xvld U2, S3, 0x00 | |||
| xvld U3, S4, 0x00 | |||
| xvld U4, S5, 0x00 | |||
| xvld U5, S6, 0x00 | |||
| xvld U6, S7, 0x00 | |||
| xvld U7, S8, 0x00 | |||
| xvld U8, S9, 0x00 | |||
| xvld U9, S10, 0x00 | |||
| xvld U10, S11, 0x00 | |||
| xvld U11, S12, 0x00 | |||
| xvld U12, S13, 0x00 | |||
| xvld U13, S14, 0x00 | |||
| xvld U14, S15, 0x00 | |||
| xvld U15, S16, 0x00 | |||
| GTRANSPOSE8x8_W D0, D2, D4, D6, D8, D10, D12, D14, \ | |||
| U0, U1, U2, U3, U4, U5, U6, U7, \ | |||
| D1, D3, D5, D7 // As tmp | |||
| GTRANSPOSE8x8_W D1, D3, D5, D7, D9, D11, D13, D15, \ | |||
| U8, U9, U10, U11, U12, U13, U14, U15, \ | |||
| U0, U1, U2, U3 // As tmp | |||
| GST xv, , D0, TD, 0x00, D1, TD, 0x20, D2, TD, 0x40, D3, TD, 0x60, \ | |||
| D4, TD, 0x80, D5, TD, 0xA0, D6, TD, 0xC0, D7, TD, 0xE0 | |||
| PTR_ADDI TD, TD, 0x100 | |||
| GST xv, , D8, TD, 0x00, D9, TD, 0x20, D10, TD, 0x40, D11, TD, 0x60, \ | |||
| D12, TD, 0x80, D13, TD, 0xA0, D14, TD, 0xC0, D15, TD, 0xE0 | |||
| PTR_ADDI TD, TD, 0x100 | |||
| PTR_ADDI S1, S1, 0x20 | |||
| PTR_ADDI S2, S2, 0x20 | |||
| PTR_ADDI S3, S3, 0x20 | |||
| PTR_ADDI S4, S4, 0x20 | |||
| PTR_ADDI S5, S5, 0x20 | |||
| PTR_ADDI S6, S6, 0x20 | |||
| PTR_ADDI S7, S7, 0x20 | |||
| PTR_ADDI S8, S8, 0x20 | |||
| PTR_ADDI S9, S9, 0x20 | |||
| PTR_ADDI S10, S10, 0x20 | |||
| PTR_ADDI S11, S11, 0x20 | |||
| PTR_ADDI S12, S12, 0x20 | |||
| PTR_ADDI S13, S13, 0x20 | |||
| PTR_ADDI S14, S14, 0x20 | |||
| PTR_ADDI S15, S15, 0x20 | |||
| PTR_ADDI S16, S16, 0x20 | |||
| PTR_ADDI I, I, -1 | |||
| blt ZERO, I, .L_M8 | |||
| .L_M7: | |||
| // Remaining M & 7 elements of the 16-vector panel, one index at a time. | |||
| andi I, M, 0x07 | |||
| beq I, ZERO, .L_M0 | |||
| .align 5 | |||
| .L_M1: | |||
| fld.s F0, S1, 0x00 | |||
| fld.s F1, S2, 0x00 | |||
| fld.s F2, S3, 0x00 | |||
| fld.s F3, S4, 0x00 | |||
| fld.s F4, S5, 0x00 | |||
| fld.s F5, S6, 0x00 | |||
| fld.s F6, S7, 0x00 | |||
| fld.s F7, S8, 0x00 | |||
| fst.s F0, TD, 0x00 | |||
| fst.s F1, TD, 0x04 | |||
| fst.s F2, TD, 0x08 | |||
| fst.s F3, TD, 0x0C | |||
| fst.s F4, TD, 0x10 | |||
| fst.s F5, TD, 0x14 | |||
| fst.s F6, TD, 0x18 | |||
| fst.s F7, TD, 0x1C | |||
| PTR_ADDI S1, S1, 0x04 | |||
| PTR_ADDI S2, S2, 0x04 | |||
| PTR_ADDI S3, S3, 0x04 | |||
| PTR_ADDI S4, S4, 0x04 | |||
| PTR_ADDI S5, S5, 0x04 | |||
| PTR_ADDI S6, S6, 0x04 | |||
| PTR_ADDI S7, S7, 0x04 | |||
| PTR_ADDI S8, S8, 0x04 | |||
| PTR_ADDI TD, TD, 0x20 | |||
| fld.s F0, S9, 0x00 | |||
| fld.s F1, S10, 0x00 | |||
| fld.s F2, S11, 0x00 | |||
| fld.s F3, S12, 0x00 | |||
| fld.s F4, S13, 0x00 | |||
| fld.s F5, S14, 0x00 | |||
| fld.s F6, S15, 0x00 | |||
| fld.s F7, S16, 0x00 | |||
| fst.s F0, TD, 0x00 | |||
| fst.s F1, TD, 0x04 | |||
| fst.s F2, TD, 0x08 | |||
| fst.s F3, TD, 0x0C | |||
| fst.s F4, TD, 0x10 | |||
| fst.s F5, TD, 0x14 | |||
| fst.s F6, TD, 0x18 | |||
| fst.s F7, TD, 0x1C | |||
| PTR_ADDI S9, S9, 0x04 | |||
| PTR_ADDI S10, S10, 0x04 | |||
| PTR_ADDI S11, S11, 0x04 | |||
| PTR_ADDI S12, S12, 0x04 | |||
| PTR_ADDI S13, S13, 0x04 | |||
| PTR_ADDI S14, S14, 0x04 | |||
| PTR_ADDI S15, S15, 0x04 | |||
| PTR_ADDI S16, S16, 0x04 | |||
| PTR_ADDI TD, TD, 0x20 | |||
| PTR_ADDI I, I, -1 | |||
| blt ZERO, I, .L_M1 | |||
| .L_M0: | |||
| blt ZERO, J, .L_N16 | |||
| .L_N15: | |||
| // Fewer than 16 vectors left: leave if none, otherwise test the 8 bit. | |||
| andi J, N, 0x0f | |||
| beq ZERO, J, .L_N0 | |||
| andi J, N, 0x08 | |||
| beq ZERO, J, .L_N7 | |||
| .L_N8: | |||
| move S1, TS | |||
| PTR_ADD S2, TS, TL | |||
| PTR_SRAI I, M, 0x03 | |||
| PTR_ADD S3, S2, TL | |||
| PTR_ADD S4, S2, T0 | |||
| PTR_ADD S5, S3, T0 | |||
| PTR_ADD S6, S4, T0 | |||
| PTR_ADD S7, S5, T0 | |||
| PTR_ADD S8, S6, T0 | |||
| PTR_ADD TS, S7, T0 | |||
| beq I, ZERO, .L_N8_M7 | |||
| .align 5 | |||
| .L_N8_M8: | |||
| xvld U0, S1, 0x00 | |||
| xvld U1, S2, 0x00 | |||
| xvld U2, S3, 0x00 | |||
| xvld U3, S4, 0x00 | |||
| xvld U4, S5, 0x00 | |||
| xvld U5, S6, 0x00 | |||
| xvld U6, S7, 0x00 | |||
| xvld U7, S8, 0x00 | |||
| GTRANSPOSE8x8_W D0, D2, D4, D6, D8, D10, D12, D14, \ | |||
| U0, U1, U2, U3, U4, U5, U6, U7, \ | |||
| D1, D3, D5, D7 // As tmp | |||
| GST xv, , D0, TD, 0x00, D2, TD, 0x20, D4, TD, 0x40, D6, TD, 0x60, \ | |||
| D8, TD, 0x80, D10, TD, 0xA0, D12, TD, 0xC0, D14, TD, 0xE0 | |||
| PTR_ADDI TD, TD, 0x100 | |||
| PTR_ADDI S1, S1, 0x20 | |||
| PTR_ADDI S2, S2, 0x20 | |||
| PTR_ADDI S3, S3, 0x20 | |||
| PTR_ADDI S4, S4, 0x20 | |||
| PTR_ADDI S5, S5, 0x20 | |||
| PTR_ADDI S6, S6, 0x20 | |||
| PTR_ADDI S7, S7, 0x20 | |||
| PTR_ADDI S8, S8, 0x20 | |||
| PTR_ADDI I, I, -1 | |||
| blt ZERO, I, .L_N8_M8 | |||
| .L_N8_M7: | |||
| andi I, M, 0x07 | |||
| beq I, ZERO, .L_N7 | |||
| .align 5 | |||
| .L_N8_M1: | |||
| fld.s F0, S1, 0x00 | |||
| fld.s F1, S2, 0x00 | |||
| fld.s F2, S3, 0x00 | |||
| fld.s F3, S4, 0x00 | |||
| fld.s F4, S5, 0x00 | |||
| fld.s F5, S6, 0x00 | |||
| fld.s F6, S7, 0x00 | |||
| fld.s F7, S8, 0x00 | |||
| fst.s F0, TD, 0x00 | |||
| PTR_ADDI S1, S1, 0x04 | |||
| fst.s F1, TD, 0x04 | |||
| PTR_ADDI S2, S2, 0x04 | |||
| fst.s F2, TD, 0x08 | |||
| PTR_ADDI S3, S3, 0x04 | |||
| fst.s F3, TD, 0x0C | |||
| PTR_ADDI S4, S4, 0x04 | |||
| fst.s F4, TD, 0x10 | |||
| PTR_ADDI S5, S5, 0x04 | |||
| fst.s F5, TD, 0x14 | |||
| PTR_ADDI S6, S6, 0x04 | |||
| fst.s F6, TD, 0x18 | |||
| PTR_ADDI S7, S7, 0x04 | |||
| fst.s F7, TD, 0x1C | |||
| PTR_ADDI S8, S8, 0x04 | |||
| PTR_ADDI TD, TD, 0x20 | |||
| PTR_ADDI I, I, -1 | |||
| blt ZERO, I, .L_N8_M1 | |||
| .L_N7: | |||
| // Fewer than 8 vectors left: leave if none, otherwise test the 4 bit. | |||
| andi J, N, 0x07 | |||
| beq ZERO, J, .L_N0 | |||
| andi J, N, 0x04 | |||
| beq ZERO, J, .L_N3 | |||
| .L_N4: | |||
| move S1, TS | |||
| PTR_ADD S2, TS, TL | |||
| PTR_SRAI I, M, 0x02 | |||
| PTR_ADD S3, S2, TL | |||
| PTR_ADD S4, S2, T0 | |||
| PTR_ADD TS, S3, T0 | |||
| beq I, ZERO, .L_N4_M3 | |||
| .align 5 | |||
| .L_N4_M4: | |||
| GLD v, , $vr0, S1, 0, $vr1, S2, 0, $vr2, S3, 0, $vr3, S4, 0 | |||
| GSBUTTERFLY v, w, $vr4, $vr5, $vr2, $vr0 | |||
| GSBUTTERFLY v, w, $vr6, $vr7, $vr3, $vr1 | |||
| GSBUTTERFLY v, w, $vr0, $vr1, $vr6, $vr4 | |||
| GSBUTTERFLY v, w, $vr2, $vr3, $vr7, $vr5 | |||
| GST v, , $vr0, TD, 0x00, $vr1, TD, 0x10, $vr2, TD, 0x20, $vr3, TD, 0x30 | |||
| PTR_ADDI S1, S1, 0x10 | |||
| PTR_ADDI S2, S2, 0x10 | |||
| PTR_ADDI S3, S3, 0x10 | |||
| PTR_ADDI S4, S4, 0x10 | |||
| PTR_ADDI TD, TD, 0x40 | |||
| PTR_ADDI I, I, -1 | |||
| blt ZERO, I, .L_N4_M4 | |||
| .L_N4_M3: | |||
| andi I, M, 0x03 | |||
| beq I, ZERO, .L_N3 | |||
| .align 5 | |||
| .L_N4_M1: | |||
| fld.s F0, S1, 0x00 | |||
| fld.s F1, S2, 0x00 | |||
| fld.s F2, S3, 0x00 | |||
| fld.s F3, S4, 0x00 | |||
| fst.s F0, TD, 0x00 | |||
| PTR_ADDI S1, S1, 0x04 | |||
| fst.s F1, TD, 0x04 | |||
| PTR_ADDI S2, S2, 0x04 | |||
| fst.s F2, TD, 0x08 | |||
| PTR_ADDI S3, S3, 0x04 | |||
| fst.s F3, TD, 0x0C | |||
| PTR_ADDI S4, S4, 0x04 | |||
| PTR_ADDI TD, TD, 0x10 | |||
| PTR_ADDI I, I, -1 | |||
| blt ZERO, I, .L_N4_M1 | |||
| .L_N3: | |||
| // Fewer than 4 vectors left: leave if none, otherwise test the 2 bit. | |||
| andi J, N, 0x03 | |||
| beq ZERO, J, .L_N0 | |||
| andi J, N, 0x02 | |||
| beq ZERO, J, .L_N1 | |||
| .L_N2: | |||
| move S1, TS | |||
| PTR_ADD S2, TS, TL | |||
| PTR_SRAI I, M, 0x01 | |||
| PTR_ADD TS, S2, TL | |||
| beq I, ZERO, .L_N2_M1 | |||
| .align 5 | |||
| .L_N2_M2: | |||
| // fld.d brings in two consecutive 4-byte elements of each vector; | |||
| // vilvl.w then interleaves them before the 16-byte store. | |||
| GLD f, d, F0, S1, 0x00, F1, S2, 0x00 | |||
| vilvl.w $vr0, $vr1, $vr0 | |||
| GST v, , $vr0, TD, 0x00 | |||
| PTR_ADDI S1, S1, 0x08 | |||
| PTR_ADDI S2, S2, 0x08 | |||
| PTR_ADDI TD, TD, 0x10 | |||
| PTR_ADDI I, I, -1 | |||
| blt ZERO, I, .L_N2_M2 | |||
| .L_N2_M1: | |||
| andi I, M, 0x01 | |||
| beq I, ZERO, .L_N1 | |||
| fld.s F0, S1, 0x00 | |||
| fld.s F1, S2, 0x00 | |||
| fst.s F0, TD, 0x00 | |||
| PTR_ADDI S1, S1, 0x04 | |||
| fst.s F1, TD, 0x04 | |||
| PTR_ADDI S2, S2, 0x04 | |||
| PTR_ADDI TD, TD, 0x08 | |||
| .align 5 | |||
| .L_N1: | |||
| // Reached either by the branch in .L_N3 or by falling through from the | |||
| // two-vector panel, so the low bit of N has to be tested here. Without | |||
| // this test an N with (N & 3) == 2 would copy one vector too many, | |||
| // reading past the requested source block and writing M extra elements. | |||
| andi J, N, 0x01 | |||
| beq ZERO, J, .L_N0 | |||
| move S1, TS | |||
| beq ZERO, M, .L_N0 | |||
| .L_N1_M1: | |||
| // Last single vector: plain element copy, M is used as the counter. | |||
| fld.s F0, S1, 0x00 | |||
| PTR_ADDI S1, S1, 0x04 | |||
| fst.s F0, TD, 0x00 | |||
| PTR_ADDI TD, TD, 0x04 | |||
| PTR_ADDI M, M, -1 | |||
| blt ZERO, M, .L_N1_M1 | |||
| .L_N0: | |||
| pop_if_used 26, 32 | |||
| jirl $r0, $r1, 0x0 | |||
| EPILOGUE | |||
| @@ -0,0 +1,298 @@ | |||
| /******************************************************************************* | |||
| Copyright (c) 2023, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *******************************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| #include "loongarch64_asm.S" | |||
| /********************************************************************* | |||
| * 2023/08/23 guxiwei | |||
| * UTEST : OK | |||
| * CTEST : OK | |||
| * TEST : OK | |||
| *********************************************************************/ | |||
| /* Function parameters */ | |||
| #define M $r4 // param 1: m | |||
| #define N $r5 // param 2: n | |||
| #define SRC $r6 // param 3: src | |||
| #define LDA $r7 // param 4: lda | |||
| #define DST $r8 // param 5: dst | |||
| // Loop counters | |||
| #define I $r9 | |||
| #define J $r10 | |||
| // Source pointers, one per source vector of the panel being copied | |||
| #define S1 $r12 | |||
| #define S2 $r13 | |||
| #define S3 $r14 | |||
| #define S4 $r15 | |||
| #define S5 $r16 | |||
| #define S6 $r17 | |||
| #define S7 $r18 | |||
| #define S8 $r19 | |||
| // TD: running destination pointer, TS: start of the next source panel | |||
| #define TD $r20 | |||
| #define TS $r11 | |||
| // TL and T0 share registers with LDA ($r7) and SRC ($r6). The kernel | |||
| // copies SRC into TS first and then overwrites both with byte strides. | |||
| #define TL $r7 | |||
| #define T0 $r6 | |||
| #undef ZERO | |||
| #define ZERO $r0 | |||
| #define F0 $f0 | |||
| #define F1 $f1 | |||
| #define F2 $f2 | |||
| #define F3 $f3 | |||
| #define F4 $f4 | |||
| #define F5 $f5 | |||
| #define F6 $f6 | |||
| #define F7 $f7 | |||
| /* LASX vectors */ | |||
| #define U0 $xr0 | |||
| #define U1 $xr1 | |||
| #define U2 $xr2 | |||
| #define U3 $xr3 | |||
| #define U4 $xr4 | |||
| #define U5 $xr5 | |||
| #define U6 $xr6 | |||
| #define U7 $xr7 | |||
| #define D0 $xr8 | |||
| #define D1 $xr9 | |||
| #define D2 $xr10 | |||
| #define D3 $xr11 | |||
| #define D4 $xr12 | |||
| #define D5 $xr13 | |||
| #define D6 $xr14 | |||
| #define D7 $xr15 | |||
| #define D8 $xr16 | |||
| #define D10 $xr17 | |||
| #define D12 $xr18 | |||
| #define D14 $xr19 | |||
| // Loops outline | |||
| //.L_N8: <---------------- | |||
| //| .L_M8: | | |||
| //| .L_M7: | Main Loop | |||
| //| .L_M1: | | |||
| //| .L_M0:-------------- | |||
| //.L_N7: | |||
| //.L_N4: | |||
| //| .L_N4_M4: | |||
| //| .L_N4_M3: | |||
| //| .L_N4_M1: | |||
| //.L_N3: | |||
| //.L_N2: | |||
| //| .L_N2_M2: | |||
| //| .L_N2_M1: | |||
| //.L_N1: | |||
| //| .L_N1_M1: | |||
| //.L_N0 | |||
| PROLOGUE | |||
| // Interleaving pack of 4-byte elements. | |||
| // The source is read as N "lines" spaced LDA elements apart, each M elements long. | |||
| // Lines are consumed in groups of 8, then 4, 2 and 1 according to the bits of N. | |||
| // For every index along M the values of the grouped lines are written back to back | |||
| // at the destination cursor TD, which only ever moves forward. | |||
| // TD/TS/TL/T0, the S*, U*, D* and F* names are register aliases defined earlier in the file. | |||
| push_if_used 17, 20 | |||
| // TD: destination cursor, TS: start of the next unprocessed source line | |||
| move TD, DST | |||
| move TS, SRC | |||
| // TL = LDA * 4 (line stride in bytes), T0 = 2 * TL | |||
| PTR_SLLI TL, LDA, 0x02 | |||
| PTR_SLLI T0, TL, 0x01 | |||
| // J = number of full 8-line groups | |||
| PTR_SRAI J, N, 0x03 | |||
| beq J, ZERO, .L_N7 | |||
| .align 5 | |||
| .L_N8: | |||
| // S1..S8 point at the 8 lines of this group; TS advances by 8 lines. | |||
| // I = number of 8-element steps along M for the vector loop. | |||
| move S1, TS | |||
| PTR_ADD S2, TS, TL | |||
| PTR_SRAI I, M, 0x03 | |||
| PTR_ADD S3, S2, TL | |||
| PTR_ADDI J, J, -1 | |||
| PTR_ADD S4, S2, T0 | |||
| PTR_ADD S5, S3, T0 | |||
| PTR_ADD S6, S4, T0 | |||
| PTR_ADD S7, S5, T0 | |||
| PTR_ADD S8, S6, T0 | |||
| PTR_ADD TS, S7, T0 | |||
| beq I, ZERO, .L_M7 | |||
| .align 5 | |||
| .L_M8: | |||
| // Load 8 elements (32 bytes) from each of the 8 lines, rearrange them with | |||
| // GTRANSPOSE8x8_W (presumably an 8x8 32-bit word transpose; macro defined elsewhere) | |||
| // and store the 8 resulting vectors contiguously (0x100 bytes). | |||
| xvld U0, S1, 0x00 | |||
| xvld U1, S2, 0x00 | |||
| xvld U2, S3, 0x00 | |||
| xvld U3, S4, 0x00 | |||
| xvld U4, S5, 0x00 | |||
| xvld U5, S6, 0x00 | |||
| xvld U6, S7, 0x00 | |||
| xvld U7, S8, 0x00 | |||
| GTRANSPOSE8x8_W D0, D2, D4, D6, D8, D10, D12, D14, \ | |||
| U0, U1, U2, U3, U4, U5, U6, U7, \ | |||
| D1, D3, D5, D7 // As tmp | |||
| GST xv, , D0, TD, 0x00, D2, TD, 0x20, D4, TD, 0x40, D6, TD, 0x60, \ | |||
| D8, TD, 0x80, D10, TD, 0xA0, D12, TD, 0xC0, D14, TD, 0xE0 | |||
| PTR_ADDI TD, TD, 0x100 | |||
| PTR_ADDI S1, S1, 0x20 | |||
| PTR_ADDI S2, S2, 0x20 | |||
| PTR_ADDI S3, S3, 0x20 | |||
| PTR_ADDI S4, S4, 0x20 | |||
| PTR_ADDI S5, S5, 0x20 | |||
| PTR_ADDI S6, S6, 0x20 | |||
| PTR_ADDI S7, S7, 0x20 | |||
| PTR_ADDI S8, S8, 0x20 | |||
| PTR_ADDI I, I, -1 | |||
| blt ZERO, I, .L_M8 | |||
| .L_M7: | |||
| // Remaining M % 8 indices of the 8-line group, one element per line at a time. | |||
| andi I, M, 0x07 | |||
| beq I, ZERO, .L_M0 | |||
| .align 5 | |||
| .L_M1: | |||
| fld.s F0, S1, 0x00 | |||
| fld.s F1, S2, 0x00 | |||
| fld.s F2, S3, 0x00 | |||
| fld.s F3, S4, 0x00 | |||
| fld.s F4, S5, 0x00 | |||
| fld.s F5, S6, 0x00 | |||
| fld.s F6, S7, 0x00 | |||
| fld.s F7, S8, 0x00 | |||
| fst.s F0, TD, 0x00 | |||
| PTR_ADDI S1, S1, 0x04 | |||
| fst.s F1, TD, 0x04 | |||
| PTR_ADDI S2, S2, 0x04 | |||
| fst.s F2, TD, 0x08 | |||
| PTR_ADDI S3, S3, 0x04 | |||
| fst.s F3, TD, 0x0C | |||
| PTR_ADDI S4, S4, 0x04 | |||
| fst.s F4, TD, 0x10 | |||
| PTR_ADDI S5, S5, 0x04 | |||
| fst.s F5, TD, 0x14 | |||
| PTR_ADDI S6, S6, 0x04 | |||
| fst.s F6, TD, 0x18 | |||
| PTR_ADDI S7, S7, 0x04 | |||
| fst.s F7, TD, 0x1C | |||
| PTR_ADDI S8, S8, 0x04 | |||
| PTR_ADDI TD, TD, 0x20 | |||
| PTR_ADDI I, I, -1 | |||
| blt ZERO, I, .L_M1 | |||
| .L_M0: | |||
| // Next 8-line group, if any. | |||
| blt ZERO, J, .L_N8 | |||
| .L_N7: | |||
| // Tail of N: nothing left if N % 8 == 0, otherwise test bit 2 for a 4-line group. | |||
| andi J, N, 0x07 | |||
| beq ZERO, J, .L_N0 | |||
| andi J, N, 0x04 | |||
| beq ZERO, J, .L_N3 | |||
| .L_N4: | |||
| // 4-line group: S1..S4, TS advances by 4 lines, I = M / 4 vector steps. | |||
| move S1, TS | |||
| PTR_ADD S2, TS, TL | |||
| PTR_SRAI I, M, 0x02 | |||
| PTR_ADD S3, S2, TL | |||
| PTR_ADD S4, S2, T0 | |||
| PTR_ADD TS, S3, T0 | |||
| beq I, ZERO, .L_N4_M3 | |||
| .align 5 | |||
| .L_N4_M4: | |||
| // 4 elements from each of 4 lines, rearranged with two butterfly stages | |||
| // (GSBUTTERFLY is a macro defined elsewhere) and stored as 0x40 contiguous bytes. | |||
| GLD v, , $vr0, S1, 0, $vr1, S2, 0, $vr2, S3, 0, $vr3, S4, 0 | |||
| GSBUTTERFLY v, w, $vr4, $vr5, $vr2, $vr0 | |||
| GSBUTTERFLY v, w, $vr6, $vr7, $vr3, $vr1 | |||
| GSBUTTERFLY v, w, $vr0, $vr1, $vr6, $vr4 | |||
| GSBUTTERFLY v, w, $vr2, $vr3, $vr7, $vr5 | |||
| GST v, , $vr0, TD, 0x00, $vr1, TD, 0x10, $vr2, TD, 0x20, $vr3, TD, 0x30 | |||
| PTR_ADDI S1, S1, 0x10 | |||
| PTR_ADDI S2, S2, 0x10 | |||
| PTR_ADDI S3, S3, 0x10 | |||
| PTR_ADDI S4, S4, 0x10 | |||
| PTR_ADDI TD, TD, 0x40 | |||
| PTR_ADDI I, I, -1 | |||
| blt ZERO, I, .L_N4_M4 | |||
| .L_N4_M3: | |||
| // Remaining M % 4 indices of the 4-line group. | |||
| andi I, M, 0x03 | |||
| beq I, ZERO, .L_N3 | |||
| .align 5 | |||
| .L_N4_M1: | |||
| fld.s F0, S1, 0x00 | |||
| fld.s F1, S2, 0x00 | |||
| fld.s F2, S3, 0x00 | |||
| fld.s F3, S4, 0x00 | |||
| fst.s F0, TD, 0x00 | |||
| PTR_ADDI S1, S1, 0x04 | |||
| fst.s F1, TD, 0x04 | |||
| PTR_ADDI S2, S2, 0x04 | |||
| fst.s F2, TD, 0x08 | |||
| PTR_ADDI S3, S3, 0x04 | |||
| fst.s F3, TD, 0x0C | |||
| PTR_ADDI S4, S4, 0x04 | |||
| PTR_ADDI TD, TD, 0x10 | |||
| PTR_ADDI I, I, -1 | |||
| blt ZERO, I, .L_N4_M1 | |||
| .L_N3: | |||
| // Nothing left if N % 4 == 0; otherwise test bit 1 for a 2-line group. | |||
| andi J, N, 0x03 | |||
| beq ZERO, J, .L_N0 | |||
| andi J, N, 0x02 | |||
| beq ZERO, J, .L_N1 | |||
| .L_N2: | |||
| // 2-line group: S1, S2, TS advances by 2 lines, I = M / 2 steps. | |||
| move S1, TS | |||
| PTR_ADD S2, TS, TL | |||
| PTR_SRAI I, M, 0x01 | |||
| PTR_ADD TS, S2, TL | |||
| beq I, ZERO, .L_N2_M1 | |||
| .align 5 | |||
| .L_N2_M2: | |||
| // Load 8 bytes (2 elements) from each line, interleave the low words of | |||
| // $vr1/$vr0 and store 16 bytes. Assumes F0/F1 alias the low part of $vr0/$vr1. | |||
| GLD f, d, F0, S1, 0x00, F1, S2, 0x00 | |||
| vilvl.w $vr0, $vr1, $vr0 | |||
| GST v, , $vr0, TD, 0x00 | |||
| PTR_ADDI S1, S1, 0x08 | |||
| PTR_ADDI S2, S2, 0x08 | |||
| PTR_ADDI TD, TD, 0x10 | |||
| PTR_ADDI I, I, -1 | |||
| blt ZERO, I, .L_N2_M2 | |||
| .L_N2_M1: | |||
| // Odd M: one last element from each of the two lines. | |||
| andi I, M, 0x01 | |||
| beq I, ZERO, .L_N1 | |||
| fld.s F0, S1, 0x00 | |||
| fld.s F1, S2, 0x00 | |||
| fst.s F0, TD, 0x00 | |||
| PTR_ADDI S1, S1, 0x04 | |||
| fst.s F1, TD, 0x04 | |||
| PTR_ADDI S2, S2, 0x04 | |||
| PTR_ADDI TD, TD, 0x08 | |||
| .align 5 | |||
| .L_N1: | |||
| // This label is reached both from .L_N3 (bit 1 of N clear) and by fall-through | |||
| // from the 2-line group, so bit 0 of N has to be tested here. Without this test | |||
| // an N with N % 4 == 2 would pack one line too many, reading past the source | |||
| // and writing M extra elements to the destination. | |||
| andi J, N, 0x01 | |||
| beq ZERO, J, .L_N0 | |||
| // Single remaining line: plain element-by-element copy of M elements. | |||
| move S1, TS | |||
| beq ZERO, M, .L_N0 | |||
| .L_N1_M1: | |||
| fld.s F0, S1, 0x00 | |||
| PTR_ADDI S1, S1, 0x04 | |||
| fst.s F0, TD, 0x00 | |||
| PTR_ADDI TD, TD, 0x04 | |||
| PTR_ADDI M, M, -1 | |||
| blt ZERO, M, .L_N1_M1 | |||
| .L_N0: | |||
| // Restore saved registers and return to the caller ($r1 holds the return address). | |||
| pop_if_used 17, 20 | |||
| jirl $r0, $r1, 0x0 | |||
| EPILOGUE | |||
| @@ -0,0 +1,526 @@ | |||
| /******************************************************************************* | |||
| Copyright (c) 2023, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *******************************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| #include "loongarch64_asm.S" | |||
| /********************************************************************* | |||
| * 2023/08/23 guxiwei | |||
| * UTEST : OK | |||
| * CTEST : OK | |||
| * TEST : OK | |||
| *********************************************************************/ | |||
| /* Function parameters */ | |||
| #define M $r4 // param 1: m | |||
| #define N $r5 // param 2: n | |||
| #define SRC $r6 // param 3: src | |||
| #define LDA $r7 // param 4: lda | |||
| #define DST $r8 // param 5: dst | |||
| // I: inner counter over chunks of a row, J: counter of 8-row blocks / row-tail bit tests | |||
| #define I $r9 | |||
| #define J $r10 | |||
| // S0: start of the next unprocessed row, S1..S8: read cursors of the rows in flight | |||
| #define S0 $r11 | |||
| #define S1 $r12 | |||
| #define S2 $r13 | |||
| #define S3 $r14 | |||
| #define S4 $r15 | |||
| #define S5 $r16 | |||
| #define S6 $r17 | |||
| #define S7 $r18 | |||
| #define S8 $r19 | |||
| // P0: destination of the next row block inside the 16-wide panels, P1: its working copy | |||
| // P2..P5: write cursors of the 8-, 4-, 2- and 1-wide tail regions | |||
| #define P0 $r20 | |||
| #define P1 $r23 | |||
| #define P2 $r24 | |||
| #define P3 $r25 | |||
| #define P4 $r26 | |||
| #define P5 $r27 | |||
| // T0/T1: scratch during setup, then T0 = 2 * TL and T1 = M * 64 (bytes between 16-wide panels) | |||
| #define T0 $r28 | |||
| #define T1 $r29 | |||
| // TL shares $r7 with LDA: LDA is converted in place to a byte stride | |||
| #define TL $r7 | |||
| // common.h may already define ZERO; drop any earlier definition so the register | |||
| // alias below takes effect without a macro-redefinition diagnostic. | |||
| #undef ZERO | |||
| #define ZERO $r0 | |||
| /* LASX vectors */ | |||
| #define U0 $xr0 | |||
| #define U1 $xr1 | |||
| #define U2 $xr2 | |||
| #define U3 $xr3 | |||
| #define U4 $xr4 | |||
| #define U5 $xr5 | |||
| #define U6 $xr6 | |||
| #define U7 $xr7 | |||
| // Loops outline | |||
| //.L_M8 <------------------- | |||
| //| .L_N16: | | |||
| //| .L_N15: | | |||
| //| .L_N8: | | |||
| //| .L_N7: | Main Loop | |||
| //| .L_N4: | | |||
| //| .L_N3: | | |||
| //| .L_N2: | | |||
| //| .L_N1: | | |||
| //| .L_N0: --------------- | |||
| //.L_M7 | |||
| //.L_M4 | |||
| //| .L_M4_N16: | |||
| //| .L_M4_N15: | |||
| //| .L_M4_N8: | |||
| //| .L_M4_N7: | |||
| //| .L_M4_N4: | |||
| //| .L_M4_N3: | |||
| //| .L_M4_N2: | |||
| //| .L_M4_N1: | |||
| //.L_M3 | |||
| //.L_M2 | |||
| //| .L_M2_N16: | |||
| //| .L_M2_N15: | |||
| //| .L_M2_N8: | |||
| //| .L_M2_N7: | |||
| //| .L_M2_N4: | |||
| //| .L_M2_N3: | |||
| //| .L_M2_N2: | |||
| //| .L_M2_N1: | |||
| //.L_M1 | |||
| //| .L_M1_N16: | |||
| //| .L_M1_N15: | |||
| //| .L_M1_N8: | |||
| //| .L_M1_N7: | |||
| //| .L_M1_N4: | |||
| //| .L_M1_N3: | |||
| //| .L_M1_N2: | |||
| //| .L_M1_N1: | |||
| //.L_M0 | |||
| PROLOGUE | |||
| // Panel pack of an M-row by N-element array of 4-byte elements (rows LDA elements apart). | |||
| // Each row is cut into 16-element chunks followed by at most one 8-, 4-, 2- and | |||
| // 1-element tail chunk (one per set bit in the low four bits of N). | |||
| // The k-th 16-element chunk of every row goes to the panel starting at DST + k * M * 64 bytes, | |||
| // rows stored back to back; tail chunks go to four further regions addressed by P2..P5. | |||
| // Rows are processed 8 at a time, then 4, 2 and 1 according to the bits of M. | |||
| push_if_used 24, 8 | |||
| move S0, SRC | |||
| move P0, DST | |||
| // P2 = DST + M * (N rounded down to 16) * 4 : start of the 8-wide tail region | |||
| // P3 = DST + M * (N rounded down to 8) * 4 : start of the 4-wide tail region | |||
| PTR_SRAI T0, N, 0x04 | |||
| PTR_SRAI T1, N, 0x03 | |||
| PTR_SLLI T0, T0, 0x04 | |||
| PTR_SLLI T1, T1, 0x03 | |||
| PTR_MUL P2, M, T0 | |||
| PTR_MUL P3, M, T1 | |||
| PTR_SLLI P2, P2, 0x02 | |||
| PTR_SLLI P3, P3, 0x02 | |||
| PTR_ADD P2, DST, P2 | |||
| PTR_ADD P3, DST, P3 | |||
| // P4 = DST + M * (N rounded down to 4) * 4 : start of the 2-wide tail region | |||
| // P5 = DST + M * (N rounded down to 2) * 4 : start of the 1-wide tail region | |||
| PTR_SRAI T0, N, 0x02 | |||
| PTR_SRAI T1, N, 0x01 | |||
| PTR_SLLI T0, T0, 0x02 | |||
| PTR_SLLI T1, T1, 0x01 | |||
| PTR_MUL P4, M, T0 | |||
| PTR_MUL P5, M, T1 | |||
| PTR_SLLI P4, P4, 0x02 | |||
| PTR_SLLI P5, P5, 0x02 | |||
| PTR_ADD P4, DST, P4 | |||
| PTR_ADD P5, DST, P5 | |||
| // TL = LDA * 4 (row stride in bytes), T0 = 2 * TL, T1 = M * 64 (bytes between 16-wide panels) | |||
| // J = number of 8-row blocks | |||
| PTR_SLLI TL, LDA, 0x02 | |||
| PTR_SRAI J, M, 0x03 | |||
| PTR_SLLI T0, TL, 0x01 | |||
| PTR_SLLI T1, M, 0x06 | |||
| beq ZERO, J, .L_M7 | |||
| .align 5 | |||
| .L_M8: | |||
| // 8-row block: S1..S8 are the row cursors, S0 advances by 8 rows. | |||
| // P1 walks the 16-wide panels for this block; P0 advances by 8 rows * 64 bytes. | |||
| move S1, S0 | |||
| PTR_ADD S2, S0, TL | |||
| PTR_ADD S3, S1, T0 | |||
| PTR_ADD S4, S2, T0 | |||
| PTR_ADD S5, S3, T0 | |||
| PTR_ADD S6, S4, T0 | |||
| PTR_ADD S7, S5, T0 | |||
| PTR_ADD S8, S6, T0 | |||
| PTR_ADD S0, S7, T0 | |||
| move P1, P0 | |||
| PTR_ADDI P0, P0, 0x200 | |||
| PTR_SRAI I, N, 0x04 | |||
| PTR_ADDI J, J, -1 | |||
| beq ZERO, I, .L_N15 | |||
| .L_N16: | |||
| // Copy 16 elements (two 32-byte vectors) from each of the 8 rows, | |||
| // two rows per load/store batch, then step P1 to the next panel. | |||
| xvld U0, S1, 0x00 | |||
| xvld U1, S1, 0x20 | |||
| xvld U2, S2, 0x00 | |||
| xvld U3, S2, 0x20 | |||
| xvst U0, P1, 0x00 | |||
| xvst U1, P1, 0x20 | |||
| xvst U2, P1, 0x40 | |||
| xvst U3, P1, 0x60 | |||
| xvld U4, S3, 0x00 | |||
| xvld U5, S3, 0x20 | |||
| xvld U6, S4, 0x00 | |||
| xvld U7, S4, 0x20 | |||
| xvst U4, P1, 0x80 | |||
| xvst U5, P1, 0xA0 | |||
| xvst U6, P1, 0xC0 | |||
| xvst U7, P1, 0xE0 | |||
| xvld U0, S5, 0x00 | |||
| xvld U1, S5, 0x20 | |||
| xvld U2, S6, 0x00 | |||
| xvld U3, S6, 0x20 | |||
| xvst U0, P1, 0x100 | |||
| xvst U1, P1, 0x120 | |||
| xvst U2, P1, 0x140 | |||
| xvst U3, P1, 0x160 | |||
| xvld U4, S7, 0x00 | |||
| xvld U5, S7, 0x20 | |||
| xvld U6, S8, 0x00 | |||
| xvld U7, S8, 0x20 | |||
| xvst U4, P1, 0x180 | |||
| xvst U5, P1, 0x1A0 | |||
| xvst U6, P1, 0x1C0 | |||
| xvst U7, P1, 0x1E0 | |||
| PTR_ADDI S1, S1, 0x40 | |||
| PTR_ADDI S2, S2, 0x40 | |||
| PTR_ADDI S3, S3, 0x40 | |||
| PTR_ADDI S4, S4, 0x40 | |||
| PTR_ADDI S5, S5, 0x40 | |||
| PTR_ADDI S6, S6, 0x40 | |||
| PTR_ADDI S7, S7, 0x40 | |||
| PTR_ADDI S8, S8, 0x40 | |||
| PTR_ADDI I, I, -1 | |||
| PTR_ADD P1, P1, T1 | |||
| blt ZERO, I, .L_N16 | |||
| .L_N15: | |||
| // 8-element tail chunk (bit 3 of N) of the 8 rows -> P2 | |||
| andi I, N, 0x08 | |||
| beq ZERO, I, .L_N7 | |||
| .L_N8: | |||
| xvld U0, S1, 0x00 | |||
| xvld U1, S2, 0x00 | |||
| xvld U2, S3, 0x00 | |||
| xvld U3, S4, 0x00 | |||
| xvld U4, S5, 0x00 | |||
| xvld U5, S6, 0x00 | |||
| xvld U6, S7, 0x00 | |||
| xvld U7, S8, 0x00 | |||
| GST xv, , U0, P2, 0x00, U1, P2, 0x20, U2, P2, 0x40, U3, P2, 0x60, \ | |||
| U4, P2, 0x80, U5, P2, 0xA0, U6, P2, 0xC0, U7, P2, 0xE0 | |||
| PTR_ADDI S1, S1, 0x20 | |||
| PTR_ADDI S2, S2, 0x20 | |||
| PTR_ADDI S3, S3, 0x20 | |||
| PTR_ADDI S4, S4, 0x20 | |||
| PTR_ADDI S5, S5, 0x20 | |||
| PTR_ADDI S6, S6, 0x20 | |||
| PTR_ADDI S7, S7, 0x20 | |||
| PTR_ADDI S8, S8, 0x20 | |||
| PTR_ADDI P2, P2, 0x100 | |||
| .L_N7: | |||
| // 4-element tail chunk (bit 2 of N) of the 8 rows -> P3 | |||
| andi I, N, 0x04 | |||
| beq ZERO, I, .L_N3 | |||
| .L_N4: | |||
| GLD v, , $vr0, S1, 0x00, $vr1, S2, 0x00, $vr2, S3, 0x00, $vr3, S4, 0x00, \ | |||
| $vr4, S5, 0x00, $vr5, S6, 0x00, $vr6, S7, 0x00, $vr7, S8, 0x00 | |||
| GST v, , $vr0, P3, 0x00, $vr1, P3, 0x10, $vr2, P3, 0x20, $vr3, P3, 0x30, \ | |||
| $vr4, P3, 0x40, $vr5, P3, 0x50, $vr6, P3, 0x60, $vr7, P3, 0x70 | |||
| PTR_ADDI S1, S1, 0x10 | |||
| PTR_ADDI S2, S2, 0x10 | |||
| PTR_ADDI S3, S3, 0x10 | |||
| PTR_ADDI S4, S4, 0x10 | |||
| PTR_ADDI S5, S5, 0x10 | |||
| PTR_ADDI S6, S6, 0x10 | |||
| PTR_ADDI S7, S7, 0x10 | |||
| PTR_ADDI S8, S8, 0x10 | |||
| PTR_ADDI P3, P3, 0x80 | |||
| .L_N3: | |||
| // 2-element tail chunk (bit 1 of N) of the 8 rows, moved as one 8-byte value per row -> P4 | |||
| andi I, N, 0x02 | |||
| beq ZERO, I, .L_N1 | |||
| .L_N2: | |||
| GLD f, d, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00, \ | |||
| $f4, S5, 0x00, $f5, S6, 0x00, $f6, S7, 0x00, $f7, S8, 0x00 | |||
| GST f, d, $f0, P4, 0x00, $f1, P4, 0x08, $f2, P4, 0x10, $f3, P4, 0x18, \ | |||
| $f4, P4, 0x20, $f5, P4, 0x28, $f6, P4, 0x30, $f7, P4, 0x38 | |||
| PTR_ADDI S1, S1, 0x08 | |||
| PTR_ADDI S2, S2, 0x08 | |||
| PTR_ADDI S3, S3, 0x08 | |||
| PTR_ADDI S4, S4, 0x08 | |||
| PTR_ADDI S5, S5, 0x08 | |||
| PTR_ADDI S6, S6, 0x08 | |||
| PTR_ADDI S7, S7, 0x08 | |||
| PTR_ADDI S8, S8, 0x08 | |||
| PTR_ADDI P4, P4, 0x40 | |||
| .L_N1: | |||
| // Last single element (bit 0 of N) of the 8 rows -> P5 | |||
| andi I, N, 0x01 | |||
| beq ZERO, I, .L_N0 | |||
| GLD f, s, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00, \ | |||
| $f4, S5, 0x00, $f5, S6, 0x00, $f6, S7, 0x00, $f7, S8, 0x00 | |||
| GST f, s, $f0, P5, 0x00, $f1, P5, 0x04, $f2, P5, 0x08, $f3, P5, 0x0C, \ | |||
| $f4, P5, 0x10, $f5, P5, 0x14, $f6, P5, 0x18, $f7, P5, 0x1C | |||
| PTR_ADDI S1, S1, 0x04 | |||
| PTR_ADDI S2, S2, 0x04 | |||
| PTR_ADDI S3, S3, 0x04 | |||
| PTR_ADDI S4, S4, 0x04 | |||
| PTR_ADDI S5, S5, 0x04 | |||
| PTR_ADDI S6, S6, 0x04 | |||
| PTR_ADDI S7, S7, 0x04 | |||
| PTR_ADDI S8, S8, 0x04 | |||
| PTR_ADDI P5, P5, 0x20 | |||
| .L_N0: | |||
| // Next 8-row block, if any. | |||
| blt ZERO, J, .L_M8 | |||
| .L_M7: | |||
| // Row tail: a 4-row block if bit 2 of M is set. | |||
| andi J, M, 0x04 | |||
| beq ZERO, J, .L_M3 | |||
| .L_M4: | |||
| // Same scheme as the 8-row block with 4 row cursors; P0 advances by 4 rows * 64 bytes. | |||
| move S1, S0 | |||
| PTR_ADD S2, S0, TL | |||
| PTR_ADD S3, S1, T0 | |||
| PTR_ADD S4, S2, T0 | |||
| PTR_ADD S0, S3, T0 | |||
| move P1, P0 | |||
| PTR_ADDI P0, P0, 0x100 | |||
| PTR_SRAI I, N, 0x04 | |||
| beq ZERO, I, .L_M4_N15 | |||
| .align 5 | |||
| .L_M4_N16: | |||
| xvld U0, S1, 0x00 | |||
| xvld U1, S1, 0x20 | |||
| xvld U2, S2, 0x00 | |||
| xvld U3, S2, 0x20 | |||
| xvst U0, P1, 0x00 | |||
| xvst U1, P1, 0x20 | |||
| xvst U2, P1, 0x40 | |||
| xvst U3, P1, 0x60 | |||
| xvld U4, S3, 0x00 | |||
| xvld U5, S3, 0x20 | |||
| xvld U6, S4, 0x00 | |||
| xvld U7, S4, 0x20 | |||
| xvst U4, P1, 0x80 | |||
| xvst U5, P1, 0xA0 | |||
| xvst U6, P1, 0xC0 | |||
| xvst U7, P1, 0xE0 | |||
| PTR_ADDI S1, S1, 0x40 | |||
| PTR_ADDI S2, S2, 0x40 | |||
| PTR_ADDI S3, S3, 0x40 | |||
| PTR_ADDI S4, S4, 0x40 | |||
| PTR_ADDI I, I, -1 | |||
| PTR_ADD P1, P1, T1 | |||
| blt ZERO, I, .L_M4_N16 | |||
| .L_M4_N15: | |||
| andi I, N, 0x08 | |||
| beq ZERO, I, .L_M4_N7 | |||
| .L_M4_N8: | |||
| xvld U0, S1, 0x00 | |||
| xvld U1, S2, 0x00 | |||
| xvld U2, S3, 0x00 | |||
| xvld U3, S4, 0x00 | |||
| GST xv, , U0, P2, 0x00, U1, P2, 0x20, U2, P2, 0x40, U3, P2, 0x60 | |||
| PTR_ADDI S1, S1, 0x20 | |||
| PTR_ADDI S2, S2, 0x20 | |||
| PTR_ADDI S3, S3, 0x20 | |||
| PTR_ADDI S4, S4, 0x20 | |||
| PTR_ADDI P2, P2, 0x80 | |||
| .L_M4_N7: | |||
| andi I, N, 0x04 | |||
| beq ZERO, I, .L_M4_N3 | |||
| .L_M4_N4: | |||
| GLD v, , $vr0, S1, 0x00, $vr1, S2, 0x00, $vr2, S3, 0x00, $vr3, S4, 0x00 | |||
| GST v, , $vr0, P3, 0x00, $vr1, P3, 0x10, $vr2, P3, 0x20, $vr3, P3, 0x30 | |||
| PTR_ADDI S1, S1, 0x10 | |||
| PTR_ADDI S2, S2, 0x10 | |||
| PTR_ADDI S3, S3, 0x10 | |||
| PTR_ADDI S4, S4, 0x10 | |||
| PTR_ADDI P3, P3, 0x40 | |||
| .L_M4_N3: | |||
| andi I, N, 0x02 | |||
| beq ZERO, I, .L_M4_N1 | |||
| .L_M4_N2: | |||
| GLD f, d, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00 | |||
| GST f, d, $f0, P4, 0x00, $f1, P4, 0x08, $f2, P4, 0x10, $f3, P4, 0x18 | |||
| PTR_ADDI S1, S1, 0x08 | |||
| PTR_ADDI S2, S2, 0x08 | |||
| PTR_ADDI S3, S3, 0x08 | |||
| PTR_ADDI S4, S4, 0x08 | |||
| PTR_ADDI P4, P4, 0x20 | |||
| .L_M4_N1: | |||
| andi I, N, 0x01 | |||
| beq ZERO, I, .L_M3 | |||
| GLD f, s, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00 | |||
| GST f, s, $f0, P5, 0x00, $f1, P5, 0x04, $f2, P5, 0x08, $f3, P5, 0x0C | |||
| PTR_ADDI S1, S1, 0x04 | |||
| PTR_ADDI S2, S2, 0x04 | |||
| PTR_ADDI S3, S3, 0x04 | |||
| PTR_ADDI S4, S4, 0x04 | |||
| PTR_ADDI P5, P5, 0x10 | |||
| .L_M3: | |||
| // Row tail: a 2-row block if bit 1 of M is set. | |||
| andi J, M, 0x02 | |||
| beq ZERO, J, .L_M1 | |||
| .L_M2: | |||
| // Two row cursors; S0 advances by 2 rows, P0 by 2 rows * 64 bytes. | |||
| move S1, S0 | |||
| PTR_ADD S2, S0, TL | |||
| PTR_ADD S0, S0, T0 | |||
| move P1, P0 | |||
| PTR_ADDI P0, P0, 0x80 | |||
| PTR_SRAI I, N, 0x04 | |||
| beq ZERO, I, .L_M2_N15 | |||
| .align 5 | |||
| .L_M2_N16: | |||
| xvld U0, S1, 0x00 | |||
| xvld U1, S1, 0x20 | |||
| xvld U2, S2, 0x00 | |||
| xvld U3, S2, 0x20 | |||
| xvst U0, P1, 0x00 | |||
| xvst U1, P1, 0x20 | |||
| xvst U2, P1, 0x40 | |||
| xvst U3, P1, 0x60 | |||
| PTR_ADDI S1, S1, 0x40 | |||
| PTR_ADDI S2, S2, 0x40 | |||
| PTR_ADDI I, I, -1 | |||
| PTR_ADD P1, P1, T1 | |||
| blt ZERO, I, .L_M2_N16 | |||
| .L_M2_N15: | |||
| andi I, N, 0x08 | |||
| beq ZERO, I, .L_M2_N7 | |||
| .L_M2_N8: | |||
| xvld U0, S1, 0x00 | |||
| xvld U1, S2, 0x00 | |||
| GST xv, , U0, P2, 0x00, U1, P2, 0x20 | |||
| PTR_ADDI S1, S1, 0x20 | |||
| PTR_ADDI S2, S2, 0x20 | |||
| PTR_ADDI P2, P2, 0x40 | |||
| .L_M2_N7: | |||
| andi I, N, 0x04 | |||
| beq ZERO, I, .L_M2_N3 | |||
| .L_M2_N4: | |||
| GLD v, , $vr0, S1, 0x00, $vr1, S2, 0x00 | |||
| GST v, , $vr0, P3, 0x00, $vr1, P3, 0x10 | |||
| PTR_ADDI S1, S1, 0x10 | |||
| PTR_ADDI S2, S2, 0x10 | |||
| PTR_ADDI P3, P3, 0x20 | |||
| .L_M2_N3: | |||
| andi I, N, 0x02 | |||
| beq ZERO, I, .L_M2_N1 | |||
| .L_M2_N2: | |||
| GLD f, d, $f0, S1, 0x00, $f1, S2, 0x00 | |||
| GST f, d, $f0, P4, 0x00, $f1, P4, 0x08 | |||
| PTR_ADDI S1, S1, 0x08 | |||
| PTR_ADDI S2, S2, 0x08 | |||
| PTR_ADDI P4, P4, 0x10 | |||
| .L_M2_N1: | |||
| andi I, N, 0x01 | |||
| beq ZERO, I, .L_M1 | |||
| GLD f, s, $f0, S1, 0x00, $f1, S2, 0x00 | |||
| GST f, s, $f0, P5, 0x00, $f1, P5, 0x04 | |||
| PTR_ADDI S1, S1, 0x04 | |||
| PTR_ADDI S2, S2, 0x04 | |||
| PTR_ADDI P5, P5, 0x08 | |||
| .L_M1: | |||
| // Row tail: one last row if M is odd. Only S1 is needed here; the second | |||
| // row cursor the other blocks set up is never read in this path, so it is not computed. | |||
| andi J, M, 0x01 | |||
| beq ZERO, J, .L_M0 | |||
| move S1, S0 | |||
| move P1, P0 | |||
| PTR_ADDI P0, P0, 0x40 | |||
| PTR_SRAI I, N, 0x04 | |||
| beq ZERO, I, .L_M1_N15 | |||
| .align 5 | |||
| .L_M1_N16: | |||
| xvld U0, S1, 0x00 | |||
| xvld U1, S1, 0x20 | |||
| xvst U0, P1, 0x00 | |||
| xvst U1, P1, 0x20 | |||
| PTR_ADDI S1, S1, 0x40 | |||
| PTR_ADDI I, I, -1 | |||
| PTR_ADD P1, P1, T1 | |||
| blt ZERO, I, .L_M1_N16 | |||
| .L_M1_N15: | |||
| andi I, N, 0x08 | |||
| beq ZERO, I, .L_M1_N7 | |||
| .L_M1_N8: | |||
| xvld U0, S1, 0x00 | |||
| GST xv, , U0, P2, 0x00 | |||
| PTR_ADDI S1, S1, 0x20 | |||
| PTR_ADDI P2, P2, 0x20 | |||
| .L_M1_N7: | |||
| andi I, N, 0x04 | |||
| beq ZERO, I, .L_M1_N3 | |||
| .L_M1_N4: | |||
| GLD v, , $vr0, S1, 0x00 | |||
| GST v, , $vr0, P3, 0x00 | |||
| PTR_ADDI S1, S1, 0x10 | |||
| PTR_ADDI P3, P3, 0x10 | |||
| .L_M1_N3: | |||
| andi I, N, 0x02 | |||
| beq ZERO, I, .L_M1_N1 | |||
| .L_M1_N2: | |||
| GLD f, d, $f0, S1, 0x00 | |||
| GST f, d, $f0, P4, 0x00 | |||
| PTR_ADDI S1, S1, 0x08 | |||
| PTR_ADDI P4, P4, 0x08 | |||
| .L_M1_N1: | |||
| andi I, N, 0x01 | |||
| beq ZERO, I, .L_M0 | |||
| GLD f, s, $f0, S1, 0x00 | |||
| GST f, s, $f0, P5, 0x00 | |||
| PTR_ADDI S1, S1, 0x04 | |||
| PTR_ADDI P5, P5, 0x04 | |||
| .L_M0: | |||
| // Restore saved registers and return to the caller ($r1 holds the return address). | |||
| pop_if_used 24, 8 | |||
| jirl $r0, $r1, 0x00 | |||
| EPILOGUE | |||
| @@ -0,0 +1,406 @@ | |||
| /******************************************************************************* | |||
| Copyright (c) 2023, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *******************************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| #include "loongarch64_asm.S" | |||
| /********************************************************************* | |||
| * 2023/08/23 guxiwei | |||
| * UTEST : OK | |||
| * CTEST : OK | |||
| * TEST : OK | |||
| *********************************************************************/ | |||
| /* Function parameters */ | |||
| #define M $r4 // param 1: m | |||
| #define N $r5 // param 2: n | |||
| #define SRC $r6 // param 3: src | |||
| #define LDA $r7 // param 4: lda | |||
| #define DST $r8 // param 5: dst | |||
| // I: inner counter over chunks of a row, J: counter of 8-row blocks / row-tail bit tests | |||
| #define I $r9 | |||
| #define J $r10 | |||
| // S0: start of the next unprocessed row, S1..S8: read cursors of the rows in flight | |||
| #define S0 $r11 | |||
| #define S1 $r12 | |||
| #define S2 $r13 | |||
| #define S3 $r14 | |||
| #define S4 $r15 | |||
| #define S5 $r16 | |||
| #define S6 $r17 | |||
| #define S7 $r18 | |||
| #define S8 $r19 | |||
| // P0: destination of the next row block inside the 8-wide panels, P1: its working copy | |||
| // P2..P4: write cursors of the 4-, 2- and 1-wide tail regions | |||
| #define P0 $r20 | |||
| #define P1 $r23 | |||
| #define P2 $r24 | |||
| #define P3 $r25 | |||
| #define P4 $r26 | |||
| // T0/T1: scratch during setup, then T0 = 2 * TL and T1 = M * 32 (bytes between 8-wide panels) | |||
| #define T0 $r27 | |||
| #define T1 $r28 | |||
| // TL shares $r7 with LDA: LDA is converted in place to a byte stride | |||
| #define TL $r7 | |||
| // Drop any earlier definition of ZERO before aliasing it to the hard-wired zero register $r0 | |||
| #undef ZERO | |||
| #define ZERO $r0 | |||
| /* LASX vectors */ | |||
| // 256-bit registers used as copy buffers by the xvld/xvst sequences | |||
| #define U0 $xr0 | |||
| #define U1 $xr1 | |||
| #define U2 $xr2 | |||
| #define U3 $xr3 | |||
| #define U4 $xr4 | |||
| #define U5 $xr5 | |||
| #define U6 $xr6 | |||
| #define U7 $xr7 | |||
| // Loops outline | |||
| //.L_M8 <------------------- | |||
| //| .L_N8: | | |||
| //| .L_N7: | Main Loop | |||
| //| .L_N4: | | |||
| //| .L_N3: | | |||
| //| .L_N2: | | |||
| //| .L_N1: | | |||
| //| .L_N0: --------------- | |||
| //.L_M7 | |||
| //.L_M4 | |||
| //| .L_M4_N8: | |||
| //| .L_M4_N7: | |||
| //| .L_M4_N4: | |||
| //| .L_M4_N3: | |||
| //| .L_M4_N2: | |||
| //| .L_M4_N1: | |||
| //.L_M3 | |||
| //.L_M2 | |||
| //| .L_M2_N8: | |||
| //| .L_M2_N7: | |||
| //| .L_M2_N4: | |||
| //| .L_M2_N3: | |||
| //| .L_M2_N2: | |||
| //| .L_M2_N1: | |||
| //.L_M1 | |||
| //| .L_M1_N8: | |||
| //| .L_M1_N7: | |||
| //| .L_M1_N4: | |||
| //| .L_M1_N3: | |||
| //| .L_M1_N2: | |||
| //| .L_M1_N1: | |||
| //.L_M0 | |||
| PROLOGUE | |||
| // Panel pack of an M-row by N-element array of 4-byte elements (rows LDA elements apart). | |||
| // Each row is cut into 8-element chunks followed by at most one 4-, 2- and | |||
| // 1-element tail chunk (one per set bit in the low three bits of N). | |||
| // The k-th 8-element chunk of every row goes to the panel starting at DST + k * M * 32 bytes, | |||
| // rows stored back to back; tail chunks go to three further regions addressed by P2..P4. | |||
| // Rows are processed 8 at a time, then 4, 2 and 1 according to the bits of M. | |||
| push_if_used 23, 8 | |||
| move S0, SRC | |||
| move P0, DST | |||
| // P2 = DST + M * (N rounded down to 8) * 4 : start of the 4-wide tail region | |||
| PTR_SRAI T1, N, 0x03 | |||
| PTR_SLLI T1, T1, 0x03 | |||
| PTR_MUL P2, M, T1 | |||
| PTR_SLLI P2, P2, 0x02 | |||
| PTR_ADD P2, DST, P2 | |||
| // P3 = DST + M * (N rounded down to 4) * 4 : start of the 2-wide tail region | |||
| // P4 = DST + M * (N rounded down to 2) * 4 : start of the 1-wide tail region | |||
| PTR_SRAI T0, N, 0x02 | |||
| PTR_SRAI T1, N, 0x01 | |||
| PTR_SLLI T0, T0, 0x02 | |||
| PTR_SLLI T1, T1, 0x01 | |||
| PTR_MUL P3, M, T0 | |||
| PTR_MUL P4, M, T1 | |||
| PTR_SLLI P3, P3, 0x02 | |||
| PTR_SLLI P4, P4, 0x02 | |||
| PTR_ADD P3, DST, P3 | |||
| PTR_ADD P4, DST, P4 | |||
| // TL = LDA * 4 (row stride in bytes), T0 = 2 * TL, T1 = M * 32 (bytes between 8-wide panels) | |||
| // J = number of 8-row blocks | |||
| PTR_SLLI TL, LDA, 0x02 | |||
| PTR_SRAI J, M, 0x03 | |||
| PTR_SLLI T0, TL, 0x01 | |||
| PTR_SLLI T1, M, 0x05 | |||
| beq ZERO, J, .L_M7 | |||
| .align 5 | |||
| .L_M8: | |||
| // 8-row block: S1..S8 are the row cursors, S0 advances by 8 rows. | |||
| // P1 walks the 8-wide panels for this block; P0 advances by 8 rows * 32 bytes. | |||
| move S1, S0 | |||
| PTR_ADD S2, S0, TL | |||
| PTR_ADD S3, S1, T0 | |||
| PTR_ADD S4, S2, T0 | |||
| PTR_ADD S5, S3, T0 | |||
| PTR_ADD S6, S4, T0 | |||
| PTR_ADD S7, S5, T0 | |||
| PTR_ADD S8, S6, T0 | |||
| PTR_ADD S0, S7, T0 | |||
| move P1, P0 | |||
| PTR_ADDI P0, P0, 0x100 | |||
| PTR_SRAI I, N, 0x03 | |||
| PTR_ADDI J, J, -1 | |||
| beq ZERO, I, .L_N7 | |||
| .L_N8: | |||
| // Copy 8 elements (one 32-byte vector) from each of the 8 rows, then step P1 to the next panel. | |||
| xvld U0, S1, 0x00 | |||
| xvld U1, S2, 0x00 | |||
| xvld U2, S3, 0x00 | |||
| xvld U3, S4, 0x00 | |||
| xvld U4, S5, 0x00 | |||
| xvld U5, S6, 0x00 | |||
| xvld U6, S7, 0x00 | |||
| xvld U7, S8, 0x00 | |||
| GST xv, , U0, P1, 0x00, U1, P1, 0x20, U2, P1, 0x40, U3, P1, 0x60, \ | |||
| U4, P1, 0x80, U5, P1, 0xA0, U6, P1, 0xC0, U7, P1, 0xE0 | |||
| PTR_ADDI S1, S1, 0x20 | |||
| PTR_ADDI S2, S2, 0x20 | |||
| PTR_ADDI S3, S3, 0x20 | |||
| PTR_ADDI S4, S4, 0x20 | |||
| PTR_ADDI S5, S5, 0x20 | |||
| PTR_ADDI S6, S6, 0x20 | |||
| PTR_ADDI S7, S7, 0x20 | |||
| PTR_ADDI S8, S8, 0x20 | |||
| PTR_ADDI I, I, -1 | |||
| PTR_ADD P1, P1, T1 | |||
| blt ZERO, I, .L_N8 | |||
| .L_N7: | |||
| // 4-element tail chunk (bit 2 of N) of the 8 rows -> P2 | |||
| andi I, N, 0x04 | |||
| beq ZERO, I, .L_N3 | |||
| .L_N4: | |||
| GLD v, , $vr0, S1, 0x00, $vr1, S2, 0x00, $vr2, S3, 0x00, $vr3, S4, 0x00, \ | |||
| $vr4, S5, 0x00, $vr5, S6, 0x00, $vr6, S7, 0x00, $vr7, S8, 0x00 | |||
| GST v, , $vr0, P2, 0x00, $vr1, P2, 0x10, $vr2, P2, 0x20, $vr3, P2, 0x30, \ | |||
| $vr4, P2, 0x40, $vr5, P2, 0x50, $vr6, P2, 0x60, $vr7, P2, 0x70 | |||
| PTR_ADDI S1, S1, 0x10 | |||
| PTR_ADDI S2, S2, 0x10 | |||
| PTR_ADDI S3, S3, 0x10 | |||
| PTR_ADDI S4, S4, 0x10 | |||
| PTR_ADDI S5, S5, 0x10 | |||
| PTR_ADDI S6, S6, 0x10 | |||
| PTR_ADDI S7, S7, 0x10 | |||
| PTR_ADDI S8, S8, 0x10 | |||
| PTR_ADDI P2, P2, 0x80 | |||
| .L_N3: | |||
| // 2-element tail chunk (bit 1 of N) of the 8 rows, moved as one 8-byte value per row -> P3 | |||
| andi I, N, 0x02 | |||
| beq ZERO, I, .L_N1 | |||
| .L_N2: | |||
| GLD f, d, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00, \ | |||
| $f4, S5, 0x00, $f5, S6, 0x00, $f6, S7, 0x00, $f7, S8, 0x00 | |||
| GST f, d, $f0, P3, 0x00, $f1, P3, 0x08, $f2, P3, 0x10, $f3, P3, 0x18, \ | |||
| $f4, P3, 0x20, $f5, P3, 0x28, $f6, P3, 0x30, $f7, P3, 0x38 | |||
| PTR_ADDI S1, S1, 0x08 | |||
| PTR_ADDI S2, S2, 0x08 | |||
| PTR_ADDI S3, S3, 0x08 | |||
| PTR_ADDI S4, S4, 0x08 | |||
| PTR_ADDI S5, S5, 0x08 | |||
| PTR_ADDI S6, S6, 0x08 | |||
| PTR_ADDI S7, S7, 0x08 | |||
| PTR_ADDI S8, S8, 0x08 | |||
| PTR_ADDI P3, P3, 0x40 | |||
| .L_N1: | |||
| // Last single element (bit 0 of N) of the 8 rows -> P4 | |||
| andi I, N, 0x01 | |||
| beq ZERO, I, .L_N0 | |||
| GLD f, s, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00, \ | |||
| $f4, S5, 0x00, $f5, S6, 0x00, $f6, S7, 0x00, $f7, S8, 0x00 | |||
| GST f, s, $f0, P4, 0x00, $f1, P4, 0x04, $f2, P4, 0x08, $f3, P4, 0x0C, \ | |||
| $f4, P4, 0x10, $f5, P4, 0x14, $f6, P4, 0x18, $f7, P4, 0x1C | |||
| PTR_ADDI S1, S1, 0x04 | |||
| PTR_ADDI S2, S2, 0x04 | |||
| PTR_ADDI S3, S3, 0x04 | |||
| PTR_ADDI S4, S4, 0x04 | |||
| PTR_ADDI S5, S5, 0x04 | |||
| PTR_ADDI S6, S6, 0x04 | |||
| PTR_ADDI S7, S7, 0x04 | |||
| PTR_ADDI S8, S8, 0x04 | |||
| PTR_ADDI P4, P4, 0x20 | |||
| .L_N0: | |||
| // Next 8-row block, if any. | |||
| blt ZERO, J, .L_M8 | |||
| .L_M7: | |||
| // Row tail: a 4-row block if bit 2 of M is set. | |||
| andi J, M, 0x04 | |||
| beq ZERO, J, .L_M3 | |||
| .L_M4: | |||
| // Same scheme as the 8-row block with 4 row cursors; P0 advances by 4 rows * 32 bytes. | |||
| move S1, S0 | |||
| PTR_ADD S2, S0, TL | |||
| PTR_ADD S3, S1, T0 | |||
| PTR_ADD S4, S2, T0 | |||
| PTR_ADD S0, S3, T0 | |||
| move P1, P0 | |||
| PTR_ADDI P0, P0, 0x80 | |||
| PTR_SRAI I, N, 0x03 | |||
| beq ZERO, I, .L_M4_N7 | |||
| .align 5 | |||
| .L_M4_N8: | |||
| xvld U0, S1, 0x00 | |||
| xvld U1, S2, 0x00 | |||
| xvld U2, S3, 0x00 | |||
| xvld U3, S4, 0x00 | |||
| GST xv, , U0, P1, 0x00, U1, P1, 0x20, U2, P1, 0x40, U3, P1, 0x60 | |||
| PTR_ADDI S1, S1, 0x20 | |||
| PTR_ADDI S2, S2, 0x20 | |||
| PTR_ADDI S3, S3, 0x20 | |||
| PTR_ADDI S4, S4, 0x20 | |||
| PTR_ADDI I, I, -1 | |||
| PTR_ADD P1, P1, T1 | |||
| blt ZERO, I, .L_M4_N8 | |||
| .L_M4_N7: | |||
| andi I, N, 0x04 | |||
| beq ZERO, I, .L_M4_N3 | |||
| .L_M4_N4: | |||
| GLD v, , $vr0, S1, 0x00, $vr1, S2, 0x00, $vr2, S3, 0x00, $vr3, S4, 0x00 | |||
| GST v, , $vr0, P2, 0x00, $vr1, P2, 0x10, $vr2, P2, 0x20, $vr3, P2, 0x30 | |||
| PTR_ADDI S1, S1, 0x10 | |||
| PTR_ADDI S2, S2, 0x10 | |||
| PTR_ADDI S3, S3, 0x10 | |||
| PTR_ADDI S4, S4, 0x10 | |||
| PTR_ADDI P2, P2, 0x40 | |||
| .L_M4_N3: | |||
| andi I, N, 0x02 | |||
| beq ZERO, I, .L_M4_N1 | |||
| .L_M4_N2: | |||
| GLD f, d, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00 | |||
| GST f, d, $f0, P3, 0x00, $f1, P3, 0x08, $f2, P3, 0x10, $f3, P3, 0x18 | |||
| PTR_ADDI S1, S1, 0x08 | |||
| PTR_ADDI S2, S2, 0x08 | |||
| PTR_ADDI S3, S3, 0x08 | |||
| PTR_ADDI S4, S4, 0x08 | |||
| PTR_ADDI P3, P3, 0x20 | |||
| .L_M4_N1: | |||
| andi I, N, 0x01 | |||
| beq ZERO, I, .L_M3 | |||
| GLD f, s, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00 | |||
| GST f, s, $f0, P4, 0x00, $f1, P4, 0x04, $f2, P4, 0x08, $f3, P4, 0x0C | |||
| PTR_ADDI S1, S1, 0x04 | |||
| PTR_ADDI S2, S2, 0x04 | |||
| PTR_ADDI S3, S3, 0x04 | |||
| PTR_ADDI S4, S4, 0x04 | |||
| PTR_ADDI P4, P4, 0x10 | |||
| .L_M3: | |||
| // Row tail: a 2-row block if bit 1 of M is set. | |||
| andi J, M, 0x02 | |||
| beq ZERO, J, .L_M1 | |||
| .L_M2: | |||
| // Two row cursors; S0 advances by 2 rows, P0 by 2 rows * 32 bytes. | |||
| move S1, S0 | |||
| PTR_ADD S2, S0, TL | |||
| PTR_ADD S0, S0, T0 | |||
| move P1, P0 | |||
| PTR_ADDI P0, P0, 0x40 | |||
| PTR_SRAI I, N, 0x03 | |||
| beq ZERO, I, .L_M2_N7 | |||
| .align 5 | |||
| .L_M2_N8: | |||
| xvld U0, S1, 0x00 | |||
| xvld U1, S2, 0x00 | |||
| GST xv, , U0, P1, 0x00, U1, P1, 0x20 | |||
| PTR_ADDI S1, S1, 0x20 | |||
| PTR_ADDI S2, S2, 0x20 | |||
| PTR_ADDI I, I, -1 | |||
| PTR_ADD P1, P1, T1 | |||
| blt ZERO, I, .L_M2_N8 | |||
| .L_M2_N7: | |||
| andi I, N, 0x04 | |||
| beq ZERO, I, .L_M2_N3 | |||
| .L_M2_N4: | |||
| GLD v, , $vr0, S1, 0x00, $vr1, S2, 0x00 | |||
| GST v, , $vr0, P2, 0x00, $vr1, P2, 0x10 | |||
| PTR_ADDI S1, S1, 0x10 | |||
| PTR_ADDI S2, S2, 0x10 | |||
| PTR_ADDI P2, P2, 0x20 | |||
| .L_M2_N3: | |||
| andi I, N, 0x02 | |||
| beq ZERO, I, .L_M2_N1 | |||
| .L_M2_N2: | |||
| GLD f, d, $f0, S1, 0x00, $f1, S2, 0x00 | |||
| GST f, d, $f0, P3, 0x00, $f1, P3, 0x08 | |||
| PTR_ADDI S1, S1, 0x08 | |||
| PTR_ADDI S2, S2, 0x08 | |||
| PTR_ADDI P3, P3, 0x10 | |||
| .L_M2_N1: | |||
| andi I, N, 0x01 | |||
| beq ZERO, I, .L_M1 | |||
| GLD f, s, $f0, S1, 0x00, $f1, S2, 0x00 | |||
| GST f, s, $f0, P4, 0x00, $f1, P4, 0x04 | |||
| PTR_ADDI S1, S1, 0x04 | |||
| PTR_ADDI S2, S2, 0x04 | |||
| PTR_ADDI P4, P4, 0x08 | |||
| .L_M1: | |||
| // Row tail: one last row if M is odd. Only S1 is needed here; the second | |||
| // row cursor the other blocks set up is never read in this path, so it is not computed. | |||
| andi J, M, 0x01 | |||
| beq ZERO, J, .L_M0 | |||
| move S1, S0 | |||
| move P1, P0 | |||
| PTR_ADDI P0, P0, 0x20 | |||
| PTR_SRAI I, N, 0x03 | |||
| beq ZERO, I, .L_M1_N7 | |||
| .align 5 | |||
| .L_M1_N8: | |||
| xvld U0, S1, 0x00 | |||
| GST xv, , U0, P1, 0x00 | |||
| PTR_ADDI S1, S1, 0x20 | |||
| PTR_ADDI I, I, -1 | |||
| PTR_ADD P1, P1, T1 | |||
| blt ZERO, I, .L_M1_N8 | |||
| .L_M1_N7: | |||
| andi I, N, 0x04 | |||
| beq ZERO, I, .L_M1_N3 | |||
| .L_M1_N4: | |||
| GLD v, , $vr0, S1, 0x00 | |||
| GST v, , $vr0, P2, 0x00 | |||
| PTR_ADDI S1, S1, 0x10 | |||
| PTR_ADDI P2, P2, 0x10 | |||
| .L_M1_N3: | |||
| andi I, N, 0x02 | |||
| beq ZERO, I, .L_M1_N1 | |||
| .L_M1_N2: | |||
| GLD f, d, $f0, S1, 0x00 | |||
| GST f, d, $f0, P3, 0x00 | |||
| PTR_ADDI S1, S1, 0x08 | |||
| PTR_ADDI P3, P3, 0x08 | |||
| .L_M1_N1: | |||
| andi I, N, 0x01 | |||
| beq ZERO, I, .L_M0 | |||
| GLD f, s, $f0, S1, 0x00 | |||
| GST f, s, $f0, P4, 0x00 | |||
| PTR_ADDI S1, S1, 0x04 | |||
| PTR_ADDI P4, P4, 0x04 | |||
| .L_M0: | |||
| // Restore saved registers and return to the caller ($r1 holds the return address). | |||
| pop_if_used 23, 8 | |||
| jirl $r0, $r1, 0x00 | |||
| EPILOGUE | |||
| @@ -61,7 +61,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| fmov.d s2, s1 | |||
| bge $r0, N, .L999 | |||
| slli.d INCX, INCX, BASE_SHIFT | |||
| bge $r0, INCX, .L999 | |||
| beq $r0, INCX, .L999 | |||
| srai.d I, N, 3 | |||
| bne INCX, TEMP, .L20 | |||
| bge $r0, I, .L15 | |||
| @@ -64,7 +64,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| MTC s1, $r0 | |||
| bge $r0, N, .L999 | |||
| slli.d INCX, INCX, ZBASE_SHIFT | |||
| bge $r0, INCX, .L999 | |||
| beq $r0, INCX, .L999 | |||
| move XX, X | |||
| MOV s2, s1 | |||
| srai.d I, N, 2 | |||
| @@ -57,7 +57,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| FLOAT absxi = 0.0; | |||
| if (n <= 0 || inc_x <= 0) return(0.0); | |||
| if (n <= 0 || inc_x == 0) return(0.0); | |||
| if ( n == 1 ) return( ABS(x[0]) ); | |||
| n *= inc_x; | |||
| @@ -48,7 +48,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| BLASLONG inc_x2; | |||
| FLOAT temp; | |||
| if (n <= 0 || inc_x <= 0) return(0.0); | |||
| if (n <= 0 || inc_x == 0) return(0.0); | |||
| inc_x2 = 2 * inc_x; | |||
| @@ -77,7 +77,7 @@ | |||
| blez N, .L999 | |||
| mov.d s2, s1 | |||
| blez INCX, .L999 | |||
| beqz INCX, .L999 | |||
| dsll INCX, INCX, ZBASE_SHIFT | |||
| dsra I, N, 2 | |||
| @@ -81,7 +81,7 @@ | |||
| blez N, .L999 | |||
| MTC $0, s1 | |||
| blez INCX, .L999 | |||
| beqz INCX, .L999 | |||
| dsll INCX, INCX, BASE_SHIFT | |||
| move XX, X | |||
| @@ -77,7 +77,7 @@ | |||
| blez N, .L999 | |||
| mov.d s2, s1 | |||
| blez INCX, .L999 | |||
| beqz INCX, .L999 | |||
| dsll INCX, INCX, BASE_SHIFT | |||
| bne INCX, TEMP, .L20 | |||
| @@ -80,7 +80,7 @@ | |||
| blez N, .L999 | |||
| MTC $0, s1 | |||
| blez INCX, .L999 | |||
| beqz INCX, .L999 | |||
| dsll INCX, INCX, ZBASE_SHIFT | |||
| move XX, X | |||
| @@ -99,7 +99,7 @@ | |||
| cmpwi cr0, N, 0 | |||
| ble- LL(9999) | |||
| cmpwi cr0, INCX, 0 | |||
| ble- LL(9999) | |||
| beq- LL(9999) | |||
| fmr f0, f1 | |||
| fmr f2, f1 | |||
| @@ -119,7 +119,7 @@ | |||
| cmpwi cr0, N, 0 | |||
| ble LL(99) | |||
| cmpwi cr0, INCX, 0 | |||
| ble LL(99) | |||
| beq LL(99) | |||
| andi. r0, X, 2 * SIZE - 1 | |||
| bne LL(100) | |||
| @@ -104,7 +104,7 @@ | |||
| cmpwi cr0, N, 0 | |||
| ble- LL(999) | |||
| cmpwi cr0, INCX, 0 | |||
| ble- LL(999) | |||
| beq- LL(999) | |||
| fmr f0, f1 | |||
| sub X, X, INCX | |||